Commit fc7f14e1 authored by vdberg's avatar vdberg
Browse files

lemmatization improved

parent 2f288f16
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -56,10 +56,12 @@ parser.add_argument("-c", "--combine", action="store_true", default=False, help=
# Remaining command-line options; `parser` itself is defined outside this excerpt.
parser.add_argument("-a", "--arg", default=None, help="analyze args or not")
parser.add_argument("-n", "--n", type=int, default=50, help="nr of top verbs to show")
parser.add_argument("-mf", "--min_freq", type=int, default=5, help="min verb frequency to eliminate rarest verbs")
# -l is a boolean switch: it is False unless the flag is given on the command line.
parser.add_argument("-l", "--lemmatize", action="store_true", default=False, help="whether to (re)do lemmatization")
args = parser.parse_args()
# Copy the parsed options into module-level constants.
MIN_FREQ = args.min_freq
N = args.n
ARG = args.arg
LEMMATIZE = args.lemmatize

# load files
# Column names are supplied explicitly through `names`.
sent = pd.read_csv('data/sent_df.csv', names=['sent_id','bias','doc_id','s'])
@@ -92,8 +94,7 @@ pd.set_option('display.width', 1000)
def lemma(x):
    """Return the lemma of the first token in ``x``.

    ``x`` is passed to the module-level ``nlp`` callable and only the
    first element of the result is inspected; its ``lemma_`` attribute
    is returned.
    """
    first_token = nlp(x)[0]
    return first_token.lemma_

lemmatize = False
if lemmatize:
if LEMMATIZE:
    nlp = spacy.load('en')
    df.verb = df.verb.apply(lemma) #['lemma']
    df.to_csv('frequent_verbs/lemmatized.csv')