Commit 32a5209c authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files

Extract Text

parent 15403abf
Loading
Loading
Loading
Loading
+80 −0
Original line number Diff line number Diff line
import seaborn as sns
import matplotlib.pyplot as plt
from cycler import cycler
import os
import pandas as pd
import statistics
import re
from nltk.probability import FreqDist

#file header: 
# work_id,title,author,rating,category,fandom,relationship,character,additional tags,language,published,status,status date,words,chapters,comments,kudos,bookmarks,hits,all_kudos,all_bookmarks,body
# 27852922,Dealing with Our Demons,['ravenyenn19'],Mature,F/M,"Six of Crows Series",Kaz Brekker/Inej Ghafa,"Kaz B","Romance,Kanej - Freeform, Eventual Smut",English,2020-12-03,Updated,2023-03-16,747673,162/?,8573,12204,1373,709212,"['ud4m', 'book_addict_1228', 'ephemeraldelights', 'bluedelilah25', 'sunshinecorsets', 'I_do_not_like_purple_glasses', 'beep_boop_00', 'schleswigholstein', 'moonandstars75', 'ewerythingoes', 'mindfighters', 'rosibunnis', 'Lizie06', 'ghostlatte', 'aguswolman', 'QueenofEnglan', 'JenBoyette04', 'gnitneb_reads', 'gloomysunshine', 'v1ofvs', 'BazzaKrekker', 'BookGeek', 'poppyflower19', 'Cassanibal', 'vanilla_chai_tea', 'Honorthyword', 'mariaarmengol', 'luc1inda', 'zarawrites', 'monmough', 'Guilty__Pleasures', 'Ilyann', 'folieadeux_0_0', 'dragonguard', 'Emeliemarx', 'angrydabee', 'slythxrclaw', 'samaram0215', 'letsgetthisbread69', 'Mintmew', 'biblichour', 'Katloupet', 'Miss_ginger', 'inejsquake', 'Arabella_7833', 'flossy_flo99', 'a_k123', 'hushedwanderer', 'siriuslymichele', 'AnnaAvinaVTDX']",[],"Dear Kaz,


# Load the fanfiction tables for both series at import time. Paths are relative
# to the current working directory; the columns are presumably those listed in
# the file-header comment above (the code below reads "published", "kudos" and
# "body").
grisha_fanfics = pd.read_csv("grishaverse/data/fanfics/grishaverse_fics.csv")
tog_fanfics = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fics.csv")
# NOTE: the triple-quoted string below is disabled code, not a docstring. As a
# bare string expression it is evaluated and discarded, so none of it runs. It
# holds a kudos frequency-distribution line plot for the Grishaverse fics.
"""
grisha_kudos = grisha_fanfics["kudos"].values.tolist()

grisha_kudos_freq_dist = FreqDist(grisha_kudos)
# convert to FreqDist object to a pandas series for easier processing
dist_panda = pd.Series(dict(grisha_kudos_freq_dist))
#print(dist_panda)


# plot using matplotlib and seaborn 

# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))

# call function for bar (value) labels 
#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)

plt.title("Grishaverse Frequency Distribution of All Kudos")
ax.set_xlabel("Number of Kudos")
ax.set_ylabel("Percentage of Occurence")


sns.lineplot(x=dist_panda.index, y=dist_panda.values, ax=ax, palette="flare")
#plt.xticks(rotation=30) !!! very useful for words
plt.savefig(f"grishaverse/freq_distribution/fanfic_kudo_freq_dist.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
"""

def preprocess_data(df, series, *, few_kudos=100, medium_kudos=1500,
                    excluded_year=2023):
    """Split fanfic bodies into kudos tiers and write one text file per tier.

    Each row of *df* is assigned to a tier by its kudos count:

    * ``bad``    -- ``kudos <= few_kudos``
    * ``medium`` -- ``few_kudos < kudos <= medium_kudos``
    * ``good``   -- ``kudos > medium_kudos``

    The bodies of each tier are concatenated (in row order, with no separator,
    exactly as before) and written to
    ``{series}/data/split_txt_fanfics/{good,medium,bad}_fics.txt``.

    Rows are skipped when they were published in *excluded_year*, when the
    kudos value is missing or not numeric, or when the body is missing. Every
    skip other than the year filter is reported on stdout.

    Args:
        df: DataFrame providing the columns ``"published"``, ``"kudos"`` and
            ``"body"``.
        series: Directory prefix of the series (e.g. ``"grishaverse"``); the
            output directory is created beneath it if necessary.
        few_kudos: Inclusive upper bound of the ``bad`` tier.
        medium_kudos: Inclusive upper bound of the ``medium`` tier.
        excluded_year: Publication year whose fics are left out entirely.

    Returns:
        None. The result is the three files written to disk.
    """
    tiers = {"good": [], "medium": [], "bad": []}

    # Walk the three needed columns directly instead of ``iterrows()``, which
    # builds a full Series for every row.
    rows = zip(df.index, df["published"], df["kudos"], df["body"])
    for index, published, raw_kudos, body in rows:
        if pd.to_datetime(published).year == excluded_year:
            continue

        if pd.isna(raw_kudos):
            print(f"Missing kudos value for row {index}")
            continue

        kudos = pd.to_numeric(raw_kudos, errors="coerce")
        if pd.isna(kudos):
            # Coercion turned an unparsable value into NaN. NaN fails every
            # comparison below, so without this check the fic would vanish
            # silently.
            print(f"Non-numeric kudos value {raw_kudos!r} for row {index}")
            continue

        if pd.isna(body):
            # str(NaN) would otherwise inject the literal text "nan".
            print(f"Missing body for row {index}")
            continue

        if kudos <= few_kudos:
            tier = "bad"
        elif kudos <= medium_kudos:
            tier = "medium"
        else:
            tier = "good"
        tiers[tier].append(str(body))

    out_dir = f"{series}/data/split_txt_fanfics"
    os.makedirs(out_dir, exist_ok=True)

    for tier, bodies in tiers.items():
        # Explicit UTF-8: the bodies are free text and the platform default
        # encoding may not be able to represent them.
        with open(f"{out_dir}/{tier}_fics.txt", "w", encoding="utf-8") as f:
            f.write("".join(bodies))


# Run the kudos-tier split for every series, in the same order as before.
for _frame, _series_dir in (
    (grisha_fanfics, "grishaverse"),
    (tog_fanfics, "throne_of_glass"),
):
    preprocess_data(_frame, _series_dir)
+0 −0

File added.

Preview suppressed by a .gitattributes entry or the file's encoding is unsupported.

+433890 −0

File added.

Preview size limit exceeded, changes collapsed.

+0 −0

File added.

Preview suppressed by a .gitattributes entry or the file's encoding is unsupported.

+2.3 KiB (55 KiB)
Loading image diff...
Loading