Loading process_srl.py +46 −23 Original line number Diff line number Diff line import pandas as pd import nltk from nltk.stem import WordNetLemmatizer import os import os, re from argparse import ArgumentParser import spacy def load_output(dir='outputs', combine=False): Loading @@ -17,7 +18,7 @@ def load_output(dir='outputs', combine=False): with open(os.path.join(p, i), encoding='utf-8') as i: all_lines.extend(i.readlines()[1:]) print('from {} file {} lines'.format(len(fs), len(all_lines))) print('from {} files {} lines'.format(len(fs), len(all_lines))) with open(outfile, 'w', encoding='utf-8') as o_add: for line in all_lines: Loading @@ -41,66 +42,88 @@ def load_nameslab(): df = df.replace({'l': 'Left', 'r': 'Right', 'c':'Center'}) return df, df.name_bias.to_dict() namlabs_df, namlabs = load_nameslab() def crude_ner(sent): global namlabs nampat = '\\b' + '\\b|\\b'.join([i for i in namlabs]) + '\\b' m = re.search(nampat, sent) return bool(m) # process command line arguments parser = ArgumentParser() parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not") parser.add_argument("-a", "--arg", default='', help="focus on an arg") parser.add_argument("-n", "--n", type=int, default=100, help="nr of top verbs to show") parser.add_argument("-a", "--arg", default='ARG0', help="focus on an arg") parser.add_argument("-n", "--n", type=int, default=50, help="nr of top verbs to show") parser.add_argument("-mf", "--min_freq", type=int, default=5, help="min verb frequency to eliminate rarest verbs") args = parser.parse_args() MIN_FREQ = args.min_freq N = args.n # load files sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s']) sent = sent[sent.s.apply(crude_ner) == True] output = load_output(dir='outputs', combine=args.combine) print('Output size {}/{}:'.format(output.shape[0], sent.shape[0])) # filter output output = output[output['name'] != 'Ryan'] df = output[output['arg'].str.startswith(args.arg)] 
# ---------------------------------------------------------------------------
# Combine SRL output with the sentence table and the name->bias table, then
# rank verbs by how concentrated they are in each (source bias, person bias)
# pair relative to the whole corpus.
# NOTE(review): this span was recovered from a garbled diff paste (it mixed
# pre- and post-change lines); the post-change side is reproduced here —
# confirm against the repository history.
# ---------------------------------------------------------------------------

# combine output: attach sentence text/bias, then each name's political bias
df = pd.merge(output, sent, on=['sent_id'])
namlabs_df, namlabs = load_nameslab()  # name -> bias lookup (also loaded at module top; re-call is harmless — TODO confirm)
df = pd.merge(df, namlabs_df, on=['name'])
# normalize the three long-form source labels to one-word bias labels
df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
df.verb = df.verb.str.lower()
print(df.shape)

# display options
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)

# lemmatize: run spaCy once and cache to disk; later runs (lemmatize = False)
# reload the cached frame instead of re-lemmatizing
def lemma(x):
    # assumes x is a single word — take the lemma of the first spaCy token
    return nlp(x)[0].lemma_

lemmatize = False
if lemmatize:
    nlp = spacy.load('en')
    df.verb = df.verb.apply(lemma)
    df.to_csv('frequent_verbs/lemmatized.csv')
else:
    df = pd.read_csv('frequent_verbs/lemmatized.csv', index_col=0)

# arg: keep only roles whose label ends in '0' (e.g. ARG0) and tag each verb
# with its argument suffix so different roles of the same verb stay distinct
df = df[df.arg.str.endswith('0')]
df.verb += ' (' + df.arg.str[2:] + ')'

# compute overall counts and drop the rarest verbs (<= MIN_FREQ occurrences)
overall_freq = df.verb.value_counts()
df["across"] = overall_freq.loc[df.verb].values
df = df[df.across > MIN_FREQ]

# for every (source bias, person bias) pair, rank verbs by within-group
# frequency normalized by corpus-wide frequency, and save the top N
binned = pd.DataFrame()
for n, gr in df.groupby(['bias', 'name_bias']):
    n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1])
    counts = pd.DataFrame(gr.verb.value_counts())
    counts.columns = ['within']  # frequency inside this (bias, name_bias) group
    counts["across"] = overall_freq.loc[counts.index].values
    counts['normalized'] = counts.within / counts.across
    sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(N)
    sorted_verbs.to_csv("frequent_verbs/{}.csv".format(n_edit), sep="\t")
    binned[n] = sorted_verbs.index

print(binned.head(N))
binned.to_csv('frequent_verbs/most_freq_verbs.csv')
exit(0)
process_srl.py +46 −23 Original line number Diff line number Diff line import pandas as pd import nltk from nltk.stem import WordNetLemmatizer import os import os, re from argparse import ArgumentParser import spacy def load_output(dir='outputs', combine=False): Loading @@ -17,7 +18,7 @@ def load_output(dir='outputs', combine=False): with open(os.path.join(p, i), encoding='utf-8') as i: all_lines.extend(i.readlines()[1:]) print('from {} file {} lines'.format(len(fs), len(all_lines))) print('from {} files {} lines'.format(len(fs), len(all_lines))) with open(outfile, 'w', encoding='utf-8') as o_add: for line in all_lines: Loading @@ -41,66 +42,88 @@ def load_nameslab(): df = df.replace({'l': 'Left', 'r': 'Right', 'c':'Center'}) return df, df.name_bias.to_dict() namlabs_df, namlabs = load_nameslab() def crude_ner(sent): global namlabs nampat = '\\b' + '\\b|\\b'.join([i for i in namlabs]) + '\\b' m = re.search(nampat, sent) return bool(m) # process command line arguments parser = ArgumentParser() parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not") parser.add_argument("-a", "--arg", default='', help="focus on an arg") parser.add_argument("-n", "--n", type=int, default=100, help="nr of top verbs to show") parser.add_argument("-a", "--arg", default='ARG0', help="focus on an arg") parser.add_argument("-n", "--n", type=int, default=50, help="nr of top verbs to show") parser.add_argument("-mf", "--min_freq", type=int, default=5, help="min verb frequency to eliminate rarest verbs") args = parser.parse_args() MIN_FREQ = args.min_freq N = args.n # load files sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s']) sent = sent[sent.s.apply(crude_ner) == True] output = load_output(dir='outputs', combine=args.combine) print('Output size {}/{}:'.format(output.shape[0], sent.shape[0])) # filter output output = output[output['name'] != 'Ryan'] df = output[output['arg'].str.startswith(args.arg)] 
# ---------------------------------------------------------------------------
# Combine SRL output with the sentence table and the name->bias table, then
# rank verbs by how concentrated they are in each (source bias, person bias)
# pair relative to the whole corpus.
# NOTE(review): this span was recovered from a garbled diff paste (it mixed
# pre- and post-change lines); the post-change side is reproduced here —
# confirm against the repository history.
# ---------------------------------------------------------------------------

# combine output: attach sentence text/bias, then each name's political bias
df = pd.merge(output, sent, on=['sent_id'])
namlabs_df, namlabs = load_nameslab()  # name -> bias lookup (also loaded at module top; re-call is harmless — TODO confirm)
df = pd.merge(df, namlabs_df, on=['name'])
# normalize the three long-form source labels to one-word bias labels
df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
df.verb = df.verb.str.lower()
print(df.shape)

# display options
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)

# lemmatize: run spaCy once and cache to disk; later runs (lemmatize = False)
# reload the cached frame instead of re-lemmatizing
def lemma(x):
    # assumes x is a single word — take the lemma of the first spaCy token
    return nlp(x)[0].lemma_

lemmatize = False
if lemmatize:
    nlp = spacy.load('en')
    df.verb = df.verb.apply(lemma)
    df.to_csv('frequent_verbs/lemmatized.csv')
else:
    df = pd.read_csv('frequent_verbs/lemmatized.csv', index_col=0)

# arg: keep only roles whose label ends in '0' (e.g. ARG0) and tag each verb
# with its argument suffix so different roles of the same verb stay distinct
df = df[df.arg.str.endswith('0')]
df.verb += ' (' + df.arg.str[2:] + ')'

# compute overall counts and drop the rarest verbs (<= MIN_FREQ occurrences)
overall_freq = df.verb.value_counts()
df["across"] = overall_freq.loc[df.verb].values
df = df[df.across > MIN_FREQ]

# for every (source bias, person bias) pair, rank verbs by within-group
# frequency normalized by corpus-wide frequency, and save the top N
binned = pd.DataFrame()
for n, gr in df.groupby(['bias', 'name_bias']):
    n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1])
    counts = pd.DataFrame(gr.verb.value_counts())
    counts.columns = ['within']  # frequency inside this (bias, name_bias) group
    counts["across"] = overall_freq.loc[counts.index].values
    counts['normalized'] = counts.within / counts.across
    sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(N)
    sorted_verbs.to_csv("frequent_verbs/{}.csv".format(n_edit), sep="\t")
    binned[n] = sorted_verbs.index

print(binned.head(N))
binned.to_csv('frequent_verbs/most_freq_verbs.csv')
exit(0)