#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Filter and enrich Spektrum article links.

Reads column-oriented JSON exports of Spektrum articles, removes links to
German-language sites and social media (logging them to report files),
requests every remaining link, and annotates working links with page title,
paper-structure information, keyword matches and linked PDFs.

Created on Thu Jun 13 11:50:27 2019

@author: arslanna
"""
import json
from bs4 import BeautifulSoup
import requests
import re
from pandas import DataFrame
import os

# Input/output locations, relative to the script's working directory.
INPUT = '../data/spektrum_links/json/'
OUTPUT = '../output/spektrum_links_output/filtered_Spektrum_Links.json'
WIKI = '../output/spektrum_links_output/wiki_links'
ERRORS = '../output/spektrum_links_output/error_links'
INVALID = '../output/spektrum_links_output/invalid_links'
SM = '../output/spektrum_links_output/social_media_links'
KEYWORDS = '../data/spektrum_keywords/spektrum_keyword_dict.json'

# Citation-style markers like "[12.34] ".  Currently unused; kept because
# other tooling may import it from this module.
p1 = re.compile(r'\[[0-9]+\.[0-9]+\]\s?')
# HTTP error codes (404/403/410) appearing as a standalone token in a title.
p2 = re.compile(r'(^|\s|\()(404|403|410)(\s|$|:|-|\))')

# Timeout (seconds) per HTTP request so one dead server cannot hang the run.
REQUEST_TIMEOUT = 30


def open_json(filename):
    """Load one Spektrum link export and re-key it by article id.

    The export is column-oriented:
        {'Id': {...}, 'Date': {...}, 'Title': {...},
         'Keywords': {...}, 'Source': {...}, 'Urls': {...}}

    Returns:
        dict mapping article id -> {'Title', 'Keywords', 'Urls'}.
    """
    with open(filename, 'r') as f:
        data = json.load(f)
    new_d = {}
    for el in data['Id']:
        ID = data['Id'][el]
        new_d[ID] = {
            'Title': data['Title'][el],
            'Keywords': data['Keywords'][el],
            'Urls': data['Urls'][el],
        }
    return new_d


def write_errors(filename, data, i):
    """Write the first ``i`` fields of every row to ``filename``.txt.

    Fields are tab-separated; each row keeps a trailing tab before the
    newline (the format the downstream tooling already expects).
    """
    with open(filename + '.txt', 'w') as f:
        for line in data:
            f.write('\t'.join(line[:i]) + '\t\n')


def errors_to_csv(filename, data):
    """Write rows of [id, title, url, ...] to ``filename``.csv.

    Only the first three fields are kept, matching the original layout.
    """
    df = DataFrame([row[:3] for row in data], columns=['Id', 'Title', 'Urls'])
    df.to_csv(filename + '.csv', index=False, header=True)


def filter_urls(data):
    """Drop German-language and social-media links from every article.

    Links to German Wikipedia and to Twitter/Facebook/YouTube are logged to
    their own .txt/.csv report files before removal; other '.de' links are
    silently dropped.  ``data`` is modified in place and returned.
    """
    wiki_de = []   # links to German Wikipedia
    sm_links = []  # links to Facebook, YouTube or Twitter
    social_hosts = ('twitter.com', 'facebook.com', 'youtube.com')
    for el, article in data.items():
        for url in article['Urls']:
            if 'de.wikipedia' in url:
                wiki_de.append([el, article['Title'], url])
            elif any(host in url for host in social_hosts):
                sm_links.append([el, article['Title'], url])
        # Removal now uses the same 'de.wikipedia' predicate as the logging
        # above (the original removed only 'de.wikipedia.org', so a logged
        # link could survive the filter).
        article['Urls'] = [
            url for url in article['Urls']
            if '.de/' not in url
            and not url.endswith('.de')
            and 'de.wikipedia' not in url
            and all(host not in url for host in social_hosts)
        ]
    write_errors(WIKI, wiki_de, 3)
    write_errors(SM, sm_links, 3)
    errors_to_csv(WIKI, wiki_de)
    errors_to_csv(SM, sm_links)
    return data


def request_urls(data):
    """Request every remaining link and attach page metadata to it.

    For each reachable link that is not an error page,
    ``data[id]['Urls'][url]`` becomes a dict with keys 'En_title',
    'Abstract', 'Structure', 'Keyword' and 'Pdfs'.  Links that raise on
    request are logged to INVALID; links resolving to an error page are
    logged to ERRORS.  Returns ``data``.
    """
    invalid_links = []  # links that raise when requested
    error_pages = []    # working links that lead to an error page
    for i, el in enumerate(data):
        urls = data[el]['Urls']
        data[el]['Urls'] = {}
        for url in urls:
            try:
                html_doc = requests.get(url, timeout=REQUEST_TIMEOUT).text
                soup = BeautifulSoup(html_doc, 'html.parser')
                title = soup.title.text.strip()
                if check_error_page(title):
                    error_pages.append([el, data[el]['Title'], url, title])
                else:
                    structure, abstract = check_structure(soup)
                    data[el]['Urls'][url] = {
                        'En_title': title,
                        'Abstract': abstract,
                        'Structure': structure,
                        'Keyword': count_keywords(data[el]['Keywords'], title),
                        'Pdfs': check_pdf(soup),
                    }
            except Exception:
                # Deliberately best-effort: any request/parse failure marks
                # the link invalid instead of aborting the whole crawl.
                # (Narrowed from a bare ``except:`` so Ctrl-C still works.)
                invalid_links.append([el, data[el]['Title'], url])
        if i % 100 == 0:
            print(f'{i} articles processed')
    write_errors(ERRORS, error_pages, 4)
    write_errors(INVALID, invalid_links, 3)
    errors_to_csv(ERRORS, error_pages)
    errors_to_csv(INVALID, invalid_links)
    return data


def check_error_page(title):
    """Return True if a page title reads like an error/placeholder page."""
    er = ['not found', 'cannot be found', 'access denied', 'twitter / ?',
          'page unavailable', 'redirecting', 'not be retrieved', 'bad title',
          'article expired', 'error page', 'server error', '500 error']
    low = title.lower()
    if any(phrase in low for phrase in er):
        return True
    return bool(p2.search(title))


def check_structure(soup):
    """Count paper-like section headings among the page's h1-h6.

    Returns:
        (count, has_abstract): how many canonical section names appear in
        the joined heading text, and whether 'abstract' is among them.
    """
    headings = [h.text
                for level in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
                for h in soup.find_all(level)]
    # Join with a real space (the original used the literal two characters
    # '\s') so adjacent headings cannot fuse into a false match.
    head = ' '.join(headings).lower()
    structure = ['Abstract', 'Introduction', 'Results', 'Discussion',
                 'References', 'Acknowledgements']
    c = sum(1 for word in structure if word.lower() in head)
    return (c, 'abstract' in head)


def count_keywords(keywords, title):
    """Rate how many of the article's keywords occur in ``title``.

    German keywords are translated to English via the module-level
    ``keyword_dict``.  Returns '0' when nothing matches, otherwise a
    'matched/total' string.
    """
    if keywords is None:
        return str(0)
    # Strip before filtering so whitespace-only tokens are dropped too.
    words = [w.strip() for w in keywords.split(',') if w.strip()]
    en_keywords = [keyword_dict[w.lower()] for w in words
                   if w.lower() in keyword_dict]
    low_title = title.lower()
    i = sum(1 for kw in en_keywords if kw.lower() in low_title)
    if i == 0:
        return str(0)
    return f'{i}/{len(en_keywords)}'


def check_pdf(soup):
    """Collect the distinct '.pdf' hrefs from all anchor tags on the page.

    Returns a sorted list so the output is deterministic (the original
    ``list(set(...))`` order varied between runs).
    """
    links = [a.get('href') for a in soup.find_all('a')]
    return sorted({l for l in links if l is not None and '.pdf' in l})


def rmv_links_witout_url(data):
    """Return a copy of ``data`` keeping only articles with remaining URLs."""
    return {el: article for el, article in data.items()
            if len(article['Urls']) != 0}


if __name__ == '__main__':
    with open(KEYWORDS, 'r') as f:
        keyword_dict = json.load(f)

    spektrum_files = os.listdir(INPUT)
    data = {}
    for file in spektrum_files:
        data.update(open_json(INPUT + file))
    data = filter_urls(data)
    data = request_urls(data)
    with open(OUTPUT, 'w') as f:
        json.dump(data, f)
    print(len(data), ' articles in original data')
    data = rmv_links_witout_url(data)
    print(len(data), ' articles in filtered data')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 13 11:50:27 2019

@author: arslanna
"""
import json
from bs4 import BeautifulSoup
import requests
import re
from pandas import DataFrame
import os

INPUT = '../data/spektrum_links/json/'
OUTPUT = '../output/spektrum_links_output/filtered_Spektrum_Links.json'
WIKI = '../output/spektrum_links_output/wiki_links'
ERRORS = '../output/spektrum_links_output/error_links'
INVALID = '../output/spektrum_links_output/invalid_links'
SM = '../output/spektrum_links_output/social_media_links'
KEYWORDS = '../data/spektrum_keywords/spektrum_keyword_dict.json'

p1 = re.compile(r'\[[0-9]+\.[0-9]+\]\s?')
p2 = re.compile(r'(^|\s|\()(404|403|410)(\s|$|:|-|\))')


def open_json(filename):
    """Read one column-oriented Spektrum export and index it by article id."""
    with open(filename, 'r') as fh:
        raw = json.load(fh)
    # raw: dict_keys(['Id', 'Date', 'Title', 'Keywords', 'Source', 'Urls'])
    reshaped = {}
    for row_key, article_id in raw['Id'].items():
        reshaped[article_id] = {
            'Title': raw['Title'][row_key],
            'Keywords': raw['Keywords'][row_key],
            'Urls': raw['Urls'][row_key],
        }
    # reshaped: {article_id: {'Title', 'Keywords', 'Urls'}}
    return reshaped


def write_errors(filename, data, i):
    """Dump the first i columns of each row, tab-separated, to <filename>.txt."""
    with open(filename + '.txt', 'w') as fh:
        for row in data:
            fh.writelines('\t'.join(row[:i]) + '\t\n')


def errors_to_csv(filename, data):
    """Dump rows of [id, title, url, ...] to <filename>.csv (3 columns)."""
    table = {'Id': {}, 'Title': {}, 'Urls': {}}
    for idx, row in enumerate(data):
        table['Id'][idx] = row[0]
        table['Title'][idx] = row[1]
        table['Urls'][idx] = row[2]
    frame = DataFrame(table, columns=['Id', 'Title', 'Urls'])
    frame.to_csv(filename + '.csv', index=None, header=True)


def filter_urls(data):
    """Strip German-language and social-media links from every article.

    Links to German Wikipedia and to Twitter/Facebook/YouTube are recorded
    in their own report files before removal; other '.de' links are simply
    dropped.  Mutates and returns ``data``.
    """
    wiki_de = []
    sm_links = []
    social_hosts = ('twitter.com', 'facebook.com', 'youtube.com')

    def keep(u):
        # Mirrors the chain of list comprehensions in the original filter.
        if '.de/' in u or u.endswith('.de'):
            return False
        if 'de.wikipedia.org' in u:
            return False
        return all(host not in u for host in social_hosts)

    for key in data.keys():
        title = data[key]['Title']
        for link in data[key]['Urls']:
            if 'de.wikipedia' in link:
                wiki_de.append([key, title, link])
            else:
                for host in social_hosts:
                    if host in link:
                        sm_links.append([key, title, link])
                        break
        data[key]['Urls'] = [u for u in data[key]['Urls'] if keep(u)]

    write_errors(WIKI, wiki_de, 3)
    write_errors(SM, sm_links, 3)
    errors_to_csv(WIKI, wiki_de)
    errors_to_csv(SM, sm_links)
    return data


def request_urls(data):
    """Request every remaining link and attach page metadata to it.

    Unreachable links are logged to INVALID, links that resolve to an error
    page to ERRORS; every other link is replaced by a metadata dict holding
    the English title, structure info, keyword rating and linked PDFs.
    """
    invalid_links = []
    error_pages = []
    for counter, key in enumerate(data.keys()):
        pending = data[key]['Urls']
        data[key]['Urls'] = {}
        for link in pending:
            try:
                page = requests.get(link).text
                soup = BeautifulSoup(page, 'html.parser')
                page_title = soup.title.text.strip()
                if check_error_page(page_title):
                    error_pages.append([key, data[key]['Title'], link, page_title])
                else:
                    section_count, has_abstract = check_structure(soup)
                    data[key]['Urls'][link] = {
                        'En_title': page_title,
                        'Abstract': has_abstract,
                        'Structure': section_count,
                        'Keyword': count_keywords(data[key]['Keywords'], page_title),
                        'Pdfs': check_pdf(soup),
                    }
            except:
                # best-effort: any failure just marks the link as invalid
                invalid_links.append([key, data[key]['Title'], link])
        if counter % 100 == 0:
            print(str(counter) + ' articles processed')
    write_errors(ERRORS, error_pages, 4)
    write_errors(INVALID, invalid_links, 3)
    errors_to_csv(ERRORS, error_pages)
    errors_to_csv(INVALID, invalid_links)
    return data


def check_error_page(title):
    """Return True when the page title reads like an error page."""
    markers = ('not found', 'cannot be found', 'access denied', 'twitter / ?',
               'page unavailable', 'redirecting', 'not be retrieved',
               'bad title', 'article expired', 'error page', 'server error',
               '500 error')
    lowered = title.lower()
    for marker in markers:
        if marker in lowered:
            return True
    if p2.search(title):
        return True
    return False


def check_structure(soup):
    """Count paper-style section headings among h1-h6 and flag an abstract.

    Returns a (count, has_abstract) tuple.
    """
    collected = []
    for level in range(1, 7):
        collected.append('-%d-' % level)
        collected.extend(h.text for h in soup.find_all('h%d' % level))
    haystack = ''.join('\s' + piece for piece in collected).lower()
    sections = ['Abstract', 'Introduction', 'Results', 'Discussion',
                'References', 'Acknowledgements']
    found = 0
    for section in sections:
        if section.lower() in haystack:
            found += 1
    has_abstract = 'abstract' in haystack
    return (found, has_abstract)


def count_keywords(keywords, title):
    """Return 'hits/total' for English keywords found in title, or '0'."""
    if keywords is None:
        return str(0)
    tokens = [w.strip() for w in keywords.split(',')
              if w != ' ' and len(w) != 0]
    translated = [keyword_dict[t.lower()] for t in tokens
                  if t.lower() in keyword_dict.keys()]
    hits = 0
    lowered_title = title.lower()
    for word in translated:
        if word.lower() in lowered_title:
            hits += 1
    return str(0) if hits == 0 else str(hits) + '/' + str(len(translated))


def check_pdf(soup):
    """Return the distinct '.pdf' hrefs found among the page's anchors."""
    hrefs = [a.get('href') for a in soup.find_all('a')]
    hrefs = [h for h in hrefs if h is not None]
    if not hrefs:
        return []
    return list(set(h for h in hrefs if '.pdf' in h))


def rmv_links_witout_url(data):
    """Drop articles whose URL dict ended up empty."""
    return {key: entry for key, entry in data.items()
            if len(entry['Urls']) != 0}


if __name__ == '__main__':
    with open(KEYWORDS, 'r') as fh:
        keyword_dict = json.load(fh)

    data = {}
    for name in os.listdir(INPUT):
        data.update(open_json(INPUT + name))
    data = filter_urls(data)
    data = request_urls(data)
    with open(OUTPUT, 'w') as fh:
        json.dump(data, fh)
    print(len(data), ' articles in original data')
    data = rmv_links_witout_url(data)
    print(len(data), ' articles in filtered data')