Commit 25623fb7 authored by Aileen Reichelt
Browse files

Attempt to read Wiki in chunks

parent 98ce9184
Loading
Loading
Loading
Loading
+16 −7
Original line number Diff line number Diff line
@@ -4,16 +4,25 @@ occur in the Wikipedia snapshot used for GloVe training."""
import pandas as pd

# Number of characters requested per read. The file is opened in text mode,
# so this is roughly 1 MB for mostly-ASCII text.
CHUNK_SIZE = 1024 * 1024


def read_chunks(file_obj, chunk_size=CHUNK_SIZE):
    """Return an iterator over pieces of at most *chunk_size* characters.

    *file_obj* must be a text-mode file object; iteration stops at the first
    empty read (end of file).
    """
    return iter(lambda: file_obj.read(chunk_size), "")


def count_occurrences(names, chunks):
    """Count how often each name occurs in a text supplied chunk by chunk.

    Args:
        names: iterable of strings to search for.
        chunks: iterable of consecutive text pieces that together make up
            the corpus.

    Returns:
        A list of ints, one per name, in the order of *names*.

    Matches that straddle a chunk boundary are counted as well: for a name
    of length k, the last k-1 characters seen before the current chunk are
    joined with the first k-1 characters of the chunk, so every match found
    in that seam necessarily crosses the boundary and cannot have been
    counted inside either chunk. Like ``str.count``, matches are counted
    without overlap, so totals for self-overlapping names may differ
    marginally from a single whole-text ``str.count``.
    """
    names = list(names)
    counts = [0] * len(names)
    # Longest amount of preceding text any name needs at a boundary.
    max_overlap = max((len(name) - 1 for name in names), default=0)
    tail = ""  # last `max_overlap` characters of the text read so far

    for chunk in chunks:
        for i, name in enumerate(names):
            counts[i] += chunk.count(name)
            overlap = len(name) - 1
            if overlap > 0 and tail:
                seam = tail[-overlap:] + chunk[:overlap]
                counts[i] += seam.count(name)
        if max_overlap:
            # Slice the chunk first so the large string is not copied.
            tail = (tail + chunk[-max_overlap:])[-max_overlap:]
    return counts


def _with_progress(chunks):
    """Yield *chunks* unchanged while printing a running progress line."""
    for chunk_no, chunk in enumerate(chunks, start=1):
        print(f"reading Wikipedia data: approx. {chunk_no} MB/8100 MB", end="\r")
        yield chunk
    print("reading completed")


def main():
    """Add a Wikipedia occurrence count to every name and write the CSV."""
    df = pd.read_csv(
        "../data/names_nationality.csv",
        usecols=["name", "nationality", "gender"],
    )

    # Text mode: combining "rb" with `encoding=` makes open() raise
    # ValueError, and str.count needs decoded text to match the names.
    with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
        # Accumulate in a plain list and assign once instead of doing a
        # DataFrame cell update per name per chunk.
        df["occurrences_in_wikipedia"] = count_occurrences(
            df["name"], _with_progress(read_chunks(f))
        )

    # NOTE(review): the inputs are read from "../data/" but this writes to
    # "./data/" - confirm the output location is intended.
    df.to_csv("./data/names_nationality_wikipedia.csv", index=False)


if __name__ == "__main__":
    main()