# process_srl.py — cleaned reconstruction of the post-diff file content.
# NOTE(review): the source was a garbled diff page; the tail of get_dom(v)
# visible in the hunk context (`except KeyError: return None`) belongs to a
# function defined outside this view and is intentionally not reconstructed.


def load_nameslab():
    """Load the hand-labeled name -> political-bias table.

    Reads data/names_labeled.csv and expands the single-letter bias codes
    ('l'/'r'/'c') to 'Left'/'Right'/'Center' everywhere in the frame.

    Returns:
        tuple: (labeled DataFrame, dict mapping the CSV index to name_bias).
    """
    namfn = 'data/names_labeled.csv'
    df = pd.read_csv(namfn, index_col=0)
    df = df.replace({'l': 'Left', 'r': 'Right', 'c': 'Center'})
    return df, df.name_bias.to_dict()


# ---- script body: join SRL output with sentence/source metadata ----------
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 10)

# SRL predictions: one row per (name, verb, argument-tag) occurrence.
# NOTE(review): column semantics of 'same' are not evident here — confirm.
df = pd.read_csv('data/srl_output.csv',
                 names=['name', 'verb', 'arg', 'same', 'sent_id'])
# Per-sentence metadata: outlet bias label, source document id, sentence text.
sent = pd.read_csv('data/sent_df.csv',
                   names=['sent_id', 'bias', 'doc_id', 's'])
df = pd.merge(df, sent, on=['sent_id'])

namlabs_df, namlabs = load_nameslab()
df = pd.merge(df, namlabs_df, on=['name'])

# Keep only rows where the named entity is the agent (B-ARG0) of the verb.
df = df[df['arg'].str.startswith('B-ARG0')]
print(df.columns)
print(df.shape)

# Normalize AllSides-style outlet labels to plain Left/Center/Right.
df = df.replace({'From the Center': 'Center',
                 'From the Right': 'Right',
                 'From the Left': 'Left'})
df.verb = df.verb.str.lower()

overall_counts = df.verb.value_counts()
# Drop verbs seen 5 times or fewer overall.
# FIX: the original routed every row through a Python closure
# (`def test(v): return overall_counts[v] > 5` via Series.apply);
# Series.map against the value_counts Series is equivalent and vectorized.
df = df[df.verb.map(overall_counts) > 5]


def _ratio_suffix(x):
    """Format a group/overall frequency ratio as ' (0.42)' for display."""
    return ' (' + str(round(x, 2)) + ')'


# For each (outlet bias, name bias) pair, rank verbs by how concentrated
# they are in that group relative to the whole corpus, and keep the top 10.
grs = pd.DataFrame()
for key, gr in df.groupby(['bias', 'name_bias']):
    counts = gr.verb.value_counts()
    # Fraction of each verb's total occurrences that fall in this group.
    counts = counts / overall_counts.loc[counts.index]
    # Annotate the verb labels with their ratio, e.g. "attack (0.42)".
    counts.index += counts.apply(_ratio_suffix)
    top = counts.sort_values(ascending=False).head(10)
    grs[key] = top.index
print(grs)
exit(0)

# Dead code: unreachable after exit(0); kept from the original as the
# planned (not yet enabled) verb-lemmatization step.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
df['lemma'] = df.verb.apply(lemmatizer.lemmatize)