Commit da33038d authored by vdberg's avatar vdberg
Browse files

documentation

parent 29ed661c
Loading
Loading
Loading
Loading

data/lemmatized.csv

0 → 100644
+8062 −0

File added.

File size exceeds preview limit.

+0 −0

File moved.

+8816 −0

File added.

File size exceeds preview limit.

+48 −31
Original line number Diff line number Diff line
@@ -6,8 +6,8 @@ from argparse import ArgumentParser
import spacy


def load_output(dir='outputs', combine=False):
    outfile = 'data/srl_preprocessed_output.csv'
def load_raw_output(dir='outputs', combine=False):
    outfile = 'outputs/srl_ner_tagged_preprocessed.csv'

    if combine:
        print('combining')
@@ -51,7 +51,9 @@ def crude_ner(sent):
    m = re.search(nampat, sent)
    return bool(m)


# process command line arguments

parser = ArgumentParser()
parser.add_argument("-c", "--combine", action="store_true", default=False, help="whether to combine or not")
parser.add_argument("-a", "--arg", default=None, help="analyze args or not")
@@ -60,84 +62,97 @@ parser.add_argument("-min", "--min_freq", type=int, default=100, help="min verb
# Upper bound on a verb's overall count; rows whose verb count is not strictly below it are dropped later.
parser.add_argument("-max", "--max_freq", type=int, default=2000, help="max verb frequency to eliminate most frequent verbs")
parser.add_argument("-l", "--lemmatize", action="store_true", default=False, help="whether to (re)do lemmatization")
args = parser.parse_args()

MIN_FREQ = args.min_freq
MAX_FREQ = args.max_freq
N = args.n
ARG = args.arg
LEMMATIZE = args.lemmatize
print(args)

# load files
sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])

# create srl tagged df files

# load inputs

sent = pd.read_csv('data/sentence_splitted.csv', names=['sent_id','bias','doc_id','s'])
sent = sent[sent.s.apply(crude_ner) == True]
output = load_output(dir='outputs', combine=args.combine)

print('Output size {}/{}:'.format(output.shape[0], sent.shape[0]))
namlabs_df, namlabs = load_nameslab()

# filter output
output = load_raw_output(dir='outputs', combine=args.combine)
output = output[output['name'] != 'Ryan']
#print('Arg-0 only {}:'.format(df.shape))
print('Output size {}/{}:'.format(output.shape[0], sent.shape[0]))

# merge & clean-up

# combine output
df = pd.merge(output, sent, on=['sent_id'])
namlabs_df, namlabs = load_nameslab()
df = pd.merge(df, namlabs_df, on=['name'])

df = df.replace({'From the Center': 'Center', 'From the Right': 'Right', 'From the Left': 'Left'})
df.verb = df.verb.str.lower()
#df = df.drop(columns='same')
print(df.shape)

# display options
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)
#print(df.columns)
#print(df.head(n=5))

# lemmatize

def lemma(x):
    """Return the lemma of the first token of *x*.

    *x* is parsed with the module-level ``nlp`` pipeline, which must be
    loaded before this function is called; only the first token of the
    parsed result is used.
    """
    parsed = nlp(x)
    first_token = parsed[0]
    return first_token.lemma_

if LEMMATIZE:
    nlp = spacy.load('en')
    df.verb = df.verb.apply(lemma) #['lemma']
    df.to_csv('frequent_verbs/lemmatized.csv')
    df.to_csv('data/lemmatized.csv')
else:
    df = pd.read_csv('frequent_verbs/lemmatized.csv', index_col=0)
    df = pd.read_csv('data/lemmatized.csv', index_col=0)

#nltk.download('wordnet')
#lemmatizer = WordNetLemmatizer()
#df.verb = df.verb.apply(lemmatizer.lemmatize)
# focus on arg if wanted

# arg
if ARG:
    #df = df[df.arg.str.endswith(ARG)]
    df.verb += ' (' + df.arg.str[2:] + ')'

# compute overall counts

# compute overall counts and filter out rare words and stop words

# Count how often each verb occurs over the whole frame (index: verb, value: count).
overall_freq = df.verb.value_counts()
# Print the first and last three entries of the count table as a sanity check.
print(overall_freq.head(3))
print(overall_freq.tail(3))
# Attach to every row the overall count of that row's verb.
df["across"] = overall_freq.loc[df.verb].values
# Keep only rows whose verb count lies strictly between MIN_FREQ and MAX_FREQ.
df = df[df.across > MIN_FREQ]
df = df[df.across < MAX_FREQ]

# group and count frequent verbs per bin

# One column per (bias, name_bias) group, holding that group's top-N verbs.
binned = pd.DataFrame()
for n, gr in df.groupby(['bias', 'name_bias']):
    # n is the (bias, name_bias) group key; build a file-name stem from it.
    n_edit = '{}_coverage_of_{}_person'.format(n[0], n[1])

    # make counts
    # 'within' = verb count inside this group; 'across' = verb count over the
    # whole (unfiltered) frame, looked up in overall_freq.
    counts = pd.DataFrame(gr.verb.value_counts())
    counts.columns = ['within']
    counts["across"] = overall_freq.loc[counts.index].values

    # normalize & sort
    # 'normalized' is the share of a verb's overall occurrences that fall in this group.
    counts['normalized'] = counts.within / counts.across #gr.verb.value_counts(normalize=True)
    #counts.normalized = counts.normalized.apply(round)
    # Keep the N verbs with the highest normalized share.
    sorted_verbs = counts.sort_values(by="normalized", ascending=False).head(N)
    # NOTE(review): this view shows the same table being written to both
    # frequent_verbs/ (here) and verb_lists/ (below) — confirm which target is current.
    sorted_verbs.to_csv("frequent_verbs/{}{}.csv".format(n_edit, '_arg' if ARG else ''), sep="\t")
    #counts.normalized = counts.normalized.apply(round)

    # store
    # Tab-separated output; the '_arg' suffix is added when ARG is set.
    sorted_verbs.to_csv("verb_lists/{}{}.csv".format(n_edit, '_arg' if ARG else ''), sep="\t")
    binned[n] = sorted_verbs.index

# display

pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)
#print(df.columns)
#print(df.head(n=5))

print(binned.head(N))
binned.to_csv('frequent_verbs/most_freq_verbs{}.csv'.format('_arg' if ARG else ''))

exit(0)

# for adding dominance

vad = pd.read_csv('data/NRC-VAD-Lexicon.txt', delimiter='\t', index_col=0)
df['dom'] = df.lemma.apply(get_dom)

@@ -152,3 +167,5 @@ for n, gr in grs:

print(arg_agr.head(n=10))
#print(dom_agr.head(n=10))

# for adding super senses
 No newline at end of file
+51 −0
Original line number Diff line number Diff line
	within	across	normalized
email	20	20	1.0
signal	5	6	0.8333333333333334
highlight	6	8	0.75
gut	4	6	0.6666666666666666
reveal	21	32	0.65625
veto	5	10	0.5
lie	4	8	0.5
enter	3	6	0.5
struggle	3	6	0.5
pay	6	12	0.5
inform	4	8	0.5
delay	7	14	0.5
know	27	55	0.4909090909090909
kill	4	9	0.4444444444444444
relate	4	9	0.4444444444444444
defeat	4	9	0.4444444444444444
roll	6	14	0.42857142857142855
violate	3	7	0.42857142857142855
think	7	17	0.4117647058823529
hear	4	10	0.4
negotiate	4	10	0.4
investigate	8	20	0.4
intend	3	8	0.375
complain	4	11	0.36363636363636365
cast	4	11	0.36363636363636365
rig	4	11	0.36363636363636365
appoint	13	36	0.3611111111111111
describe	6	18	0.3333333333333333
watch	7	21	0.3333333333333333
create	5	15	0.3333333333333333
launch	5	15	0.3333333333333333
enact	2	6	0.3333333333333333
erase	2	6	0.3333333333333333
quote	2	6	0.3333333333333333
rescind	5	15	0.3333333333333333
direct	3	9	0.3333333333333333
rename	2	6	0.3333333333333333
sit	3	9	0.3333333333333333
pursue	3	9	0.3333333333333333
reverse	10	30	0.3333333333333333
address	4	12	0.3333333333333333
concern	2	6	0.3333333333333333
examine	2	6	0.3333333333333333
look	11	34	0.3235294117647059
question	5	16	0.3125
appear	12	39	0.3076923076923077
face	4	13	0.3076923076923077
believe	8	26	0.3076923076923077
begin	7	23	0.30434782608695654
learn	3	10	0.3
Loading