"""Count how often each name from the names dataset occurs in the
Wikipedia snapshot used for GloVe training.

Reads the ~8 GB corpus in fixed-size chunks so it never has to fit in
memory, and keeps a small overlap between consecutive chunks so that
names straddling a chunk boundary are still counted.
"""
import pandas as pd

df = pd.read_csv(
    "../data/names_nationality.csv",
    usecols=["name", "nationality", "gender"],
)

# Hoist the names out of the DataFrame once; calling df.iterrows() for
# every 1 MB chunk of an ~8 GB corpus is needlessly slow. Counts are
# accumulated in a dict and written back to the DataFrame at the end.
names = df["name"].tolist()
counts = {name: 0 for name in names}

CHUNK_SIZE = 1024 * 1024  # characters per read (~1 MB for ASCII-heavy text)

# An occurrence split across two chunks fits entirely inside
# tail + next_chunk as long as the tail keeps the last
# (longest name length - 1) characters of the previous chunk.
overlap_len = max((len(name) for name in names), default=1) - 1

# NOTE: must be text mode -- open() rejects an encoding argument in
# "rb" mode (ValueError), and str.count() needs str chunks anyway.
with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
    chunk_no = 1
    tail = ""
    while True:
        print(f"reading Wikipedia data: approx. {chunk_no} MB/8100 MB", end="\r")
        chunk = f.read(CHUNK_SIZE)
        if not chunk:
            print("reading completed")
            break
        buffer = tail + chunk
        for name in names:
            # Occurrences lying entirely inside `tail` were already
            # counted in the previous iteration, so subtract them out;
            # what remains are occurrences that end in the new chunk.
            counts[name] += buffer.count(name) - tail.count(name)
        tail = chunk[-overlap_len:] if overlap_len > 0 else ""
        chunk_no += 1

df["occurrences_in_wikipedia"] = df["name"].map(counts)

# Write next to the inputs: the previous "./data/..." path was
# inconsistent with every input path ("../data/...") and would fail
# unless a second data/ directory existed beside the script.
df.to_csv("../data/names_nationality_wikipedia.csv", index=False)