process_srl.py  +22 −32

@@ -6,7 +6,7 @@ from argparse import ArgumentParser

 def load_output(dir='outputs', combine=False):
-    outfile = 'data/srl_output.csv'
+    outfile = 'data/srl_preprocessed_output.csv'
     if combine:
         print('combining')
@@ -34,6 +34,7 @@ def get_dom(v):
     except KeyError:
         return None

+
 def load_nameslab():
     namfn = 'data/names_labeled.csv'
     df = pd.read_csv(namfn, index_col=0)
@@ -41,53 +42,48 @@ def load_nameslab():
     return df, df.name_bias.to_dict()

+# process command line arguments
 parser = ArgumentParser()
 parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not")
 parser.add_argument("-a", "--arg", default='', help="focus on an arg")
 parser.add_argument("-n", "--n", type=int, default=100, help="nr of top verbs to show")
 args = parser.parse_args()

-df = load_output(dir='outputs', combine=args.combine)
-print('Output size {}:'.format(df.shape))
-df = df[df['name'] != 'Ryan']
-df = df[df['arg'].str.startswith(args.arg)]
+# load files
+sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])
+output = load_output(dir='outputs', combine=args.combine)
+print('Output size {}/{}:'.format(output.shape[0], sent.shape[0]))
+
+# filter output
+output = output[output['name'] != 'Ryan']
+df = output[output['arg'].str.startswith(args.arg)]
 #print('Arg-0 only {}:'.format(df.shape))
-sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])

+# combine output
 df = pd.merge(df, sent, on=['sent_id'])
 namlabs_df, namlabs = load_nameslab()
 df = pd.merge(df, namlabs_df, on=['name'])
-df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
-df.verb = df.verb.str.lower()
 #df = df.drop(columns='same')

 # show what data is like
 pd.set_option('display.max_colwidth', 200)
 pd.set_option('display.max_columns', 10)
 print(df.columns)
 print(df.shape)
 #print(df.head(n=5))

-# lemmatize
-#nltk.download('wordnet')
-#lemmatizer = WordNetLemmatizer()
-#df['lemma'] = df.verb.apply(lemmatizer.lemmatize)
+df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
+df.verb = df.verb.str.lower()

 # compute overall counts
 overall_counts = df.verb.value_counts()
 def test(v):
     return overall_counts[v] > 5
 df = df[df.verb.apply(test)]

+lemmatizer = WordNetLemmatizer()
+df.verb = df.verb.apply(lemmatizer.lemmatize)

 def cust(x):
     return ' (' + str(x) + ')'

-grs = pd.DataFrame()
+binned = pd.DataFrame()
 for n, gr in df.groupby(['bias', 'name_bias']):
     n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1])
     counts = pd.DataFrame(gr.verb.value_counts())#, columns=["within"])
@@ -99,21 +95,16 @@ for n, gr in df.groupby(['bias', 'name_bias']):
     #counts.normalized = counts.normalized.apply(round)
     #print(counts)
     sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(args.n)
-    #print(sorted_verbs)
-    print(sorted_verbs.head(2))
-    #exit(0)
-    grs[n] = sorted_verbs.index
+    sorted_verbs.to_csv("frequent_verbs/{}.csv".format(n_edit), sep="\t")
+    binned[n] = sorted_verbs.index

-print(grs)
-grs.to_csv('frequent_verbs/most_freq_verbs.csv')
+print(binned.head())
+binned.to_csv('frequent_verbs/most_freq_verbs.csv')
 exit(0)

-nltk.download('wordnet')
-lemmatizer = WordNetLemmatizer()
-df['lemma'] = df.verb.apply(lemmatizer.lemmatize)
 df.arg = df.arg.str[2:]
 vad = pd.read_csv('data/NRC-VAD-Lexicon.txt', delimiter='\t', index_col=0)
 df['dom'] = df.lemma.apply(get_dom)
@@ -121,11 +112,10 @@ grs = df.drop_duplicates().groupby('same')

 # symptoms of agency
 arg_agr = pd.DataFrame(columns=[''])
 dom_agr = pd.DataFrame(columns=[''])
 for n, gr in grs:
     print(n, gr.dom.mean())
     arg_agr[n] = gr['arg'].value_counts(normalize=True)
-    #dom_agr[n] = gr.nlargest(n=10, columns=['dom'])
+    dom_agr[n] = gr.nlargest(n=10, columns=['dom'])
 print(arg_agr.head(n=10))
 #print(dom_agr.head(n=10))
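A note for readers tracing the grouped counts: the hunk that builds the `normalized` column is collapsed in this view, so the exact formula is not visible. Below is a minimal self-contained sketch of the pattern the loop appears to implement, under the assumption that `normalized` relates the within-group verb count to the corpus-wide count; the toy data and the head(3) cutoff are purely illustrative.

import pandas as pd

# Toy frame standing in for the merged SRL output (hypothetical values).
df = pd.DataFrame({
    'bias':      ['Left', 'Left',  'Right', 'Right', 'Right',  'Left'],
    'name_bias': ['Left', 'Left',  'Left',  'Right', 'Right',  'Right'],
    'verb':      ['say',  'claim', 'say',   'say',   'attack', 'say'],
})

overall_counts = df.verb.value_counts()  # corpus-wide verb frequencies

top_per_group = {}
for n, gr in df.groupby(['bias', 'name_bias']):
    counts = gr.verb.value_counts().rename('within').to_frame()
    # Assumption: share of a verb's overall use that falls in this group.
    counts['normalized'] = counts['within'] / overall_counts
    top = counts.sort_values('normalized', ascending=False).head(3)
    top_per_group[n] = pd.Series(top.index)

# Building from a dict of Series pads ragged columns with NaN; assigning
# Index objects column by column (as the script does) needs equal lengths,
# which head(args.n) only guarantees when every group has enough verbs.
binned = pd.DataFrame(top_per_group)
print(binned)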
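Similarly, the body of get_dom is truncated above; only its `except KeyError: return None` tail is visible. A plausible sketch, assuming it looks up a lemma's Dominance score in the NRC-VAD lexicon loaded into `vad` (the column name 'Dominance' follows the published lexicon header and is an assumption here, not confirmed by the diff):

import pandas as pd

# NRC-VAD: word -> valence/arousal/dominance scores in [0, 1], tab-separated.
vad = pd.read_csv('data/NRC-VAD-Lexicon.txt', delimiter='\t', index_col=0)

def get_dom(v):
    # Dominance score for lemma v; None for out-of-lexicon words,
    # which pandas stores as NaN and gr.dom.mean() then skips.
    try:
        return vad.loc[v, 'Dominance']
    except KeyError:
        return None

With the expected inputs in place (outputs/, data/sent_df.csv, data/names_labeled.csv, data/NRC-VAD-Lexicon.txt), the script would be invoked along the lines of `python process_srl.py -c -n 50`, using the flags from the argparse block above. Note that the `exit(0)` after the frequent-verbs export currently stops execution before the VAD/agency step ever runs.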