Attempt to read Wiki in chunks (25623fb7) · Commits · Aileen Reichelt / Analysing and Mitigating Origin Bias in German Word Embeddings

count_name_occurrences.py

+16 −7

Original line number	Diff line number	Diff line
		@@ -4,16 +4,25 @@ occur in the Wikipedia snapshot used for GloVe training."""
		import pandas as pd

		df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality", "gender"])

		# Number of characters requested per read. The corpus is opened in text
		# mode, so one chunk is roughly (not exactly) 1 MB of UTF-8 data.
		CHUNK_SIZE = 1024 * 1024

		names = df["name"].tolist()
		# Running totals, positionally aligned with the rows of df. Accumulating in
		# a plain list avoids a DataFrame cell update per name per chunk.
		occurrences = [0] * len(names)

		# Text mode is required: open() rejects an `encoding` argument in binary
		# mode, and str.count() needs str (not bytes) chunks to search in.
		with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
		    previous_chunk = ""
		    # iter(callable, sentinel) keeps calling read() until it returns "" at EOF.
		    for chunk_no, chunk in enumerate(iter(lambda: f.read(CHUNK_SIZE), ""), start=1):
		        print(f"reading Wikipedia data: approx. {chunk_no} MB/8100 MB", end="\r")
		        for position, name in enumerate(names):
		            count = chunk.count(name)
		            # A name split across the chunk boundary is invisible to the
		            # per-chunk count. The seam holds len(name) - 1 characters from
		            # each side, so it can only contain matches that really span
		            # the boundary and never re-counts a match inside one chunk.
		            overlap = len(name) - 1
		            if previous_chunk and overlap > 0:
		                seam = previous_chunk[-overlap:] + chunk[:overlap]
		                count += seam.count(name)
		            occurrences[position] += count
		        previous_chunk = chunk
		    print("reading completed")

		df["occurrences_in_wikipedia"] = occurrences

		# NOTE(review): inputs are read from "../data/" but the result is written
		# below "./data/" -- confirm this output path is intended.
		df.to_csv("./data/names_nationality_wikipedia.csv", index=False)