Commit a12766b4 authored by nwarslan's avatar nwarslan
Browse files

added filter_data.py

parent f1943575
Loading
Loading
Loading
Loading

code/filter_data.py

0 → 100644
+225 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 13 11:50:27 2019

@author: arslanna
"""
import json
from bs4 import BeautifulSoup
import requests
import re
from pandas import DataFrame
import os

# Input directory with the exported spektrum link json files.
INPUT = '../data/spektrum_links/json/'
# Filtered result plus the various reject/report file stems (each gets
# both a .txt and a .csv extension appended by the writer functions).
OUTPUT = '../output/spektrum_links_output/filtered_Spektrum_Links.json'
WIKI = '../output/spektrum_links_output/wiki_links'
ERRORS = '../output/spektrum_links_output/error_links'
INVALID = '../output/spektrum_links_output/invalid_links'
SM = '../output/spektrum_links_output/social_media_links'
# German -> English keyword translation table (loaded in __main__).
KEYWORDS = '../data/spektrum_keywords/spektrum_keyword_dict.json'

# Citation-style markers like '[1.2] ' — appears unused in this module.
# Raw strings fix the invalid escape sequences ('\[', '\s', ...) that the
# old plain string literals produced warnings for on modern Python.
p1 = re.compile(r'\[[0-9]+\.[0-9]+\]\s?')
# Standalone 403/404/410 status codes in page titles (used by check_error_page).
p2 = re.compile(r'(^|\s|\()(404|403|410)(\s|$|:|-|\))')

def open_json(filename):
    """
    Load a json export of spektrum links and re-key it by article id.

    The input maps column names to row dicts:
        {'Id': {...}, 'Date': {...}, 'Title': {...}, 'Keywords': {...},
         'Source': {...}, 'Urls': {...}}
    The result maps each article id to its own record:
        {id: {'Title': ..., 'Keywords': ..., 'Urls': ...}}
    """
    with open(filename, 'r') as fh:
        raw = json.load(fh)
    reshaped = {}
    for row, article_id in raw['Id'].items():
        reshaped[article_id] = {
            'Title': raw['Title'][row],
            'Keywords': raw['Keywords'][row],
            'Urls': raw['Urls'][row],
        }
    return reshaped

def write_errors(filename, data, i):
    """
    Write the first *i* fields of each record to ``filename + '.txt'``.

    Parameters
    ----------
    filename : str
        Output path stem; '.txt' is appended.
    data : iterable of sequences of str
        Each record must have at least *i* fields.
    i : int
        Number of leading fields written per record.

    Each record becomes one line of tab-separated fields.  The trailing
    tab before the newline matches the historical output format, so
    existing consumers still parse the files.
    """
    with open(filename + '.txt', 'w') as f:
        for record in data:
            # '\t'.join replaces the old quadratic += loop; f.write
            # replaces a misuse of writelines() on a single string
            f.write('\t'.join(record[:i]) + '\t\n')
            
def errors_to_csv(filename, data):
    """
    Dump error records to ``filename + '.csv'`` with Id/Title/Urls columns.

    data: iterable of records, each with at least three fields
    (id, title, url).  No index column is written.
    """
    rows = list(data)
    table = {
        'Id': {idx: rec[0] for idx, rec in enumerate(rows)},
        'Title': {idx: rec[1] for idx, rec in enumerate(rows)},
        'Urls': {idx: rec[2] for idx, rec in enumerate(rows)},
    }
    frame = DataFrame(table, columns=['Id', 'Title', 'Urls'])
    frame.to_csv(filename + '.csv', index=None, header=True)
           
def filter_urls(data):
    """
    Drop German and social-media links from every article's url list.

    Links to the German wikipedia are logged to the WIKI files, links to
    twitter/facebook/youtube to the SM files (each as .txt and .csv).
    Urls containing '.de/' or ending in '.de' are dropped silently.
    Mutates *data* in place and returns it.
    """
    wiki_de = []   # [id, title, url] for German wikipedia links
    sm_links = []  # [id, title, url] for social media links
    social_hosts = ('twitter.com', 'facebook.com', 'youtube.com')
    drop_marks = ('de.wikipedia.org',) + social_hosts
    for article_id, article in data.items():
        for url in article['Urls']:
            if 'de.wikipedia' in url:
                wiki_de.append([article_id, article['Title'], url])
            elif any(host in url for host in social_hosts):
                sm_links.append([article_id, article['Title'], url])
        article['Urls'] = [
            url for url in article['Urls']
            if '.de/' not in url
            and not url.endswith('.de')
            and not any(mark in url for mark in drop_marks)
        ]
    write_errors(WIKI, wiki_de, 3)
    write_errors(SM, sm_links, 3)
    errors_to_csv(WIKI, wiki_de)
    errors_to_csv(SM, sm_links)
    return data

def request_urls(data):
    """
    Request every remaining url of every article.

    Urls whose request (or parsing) raises are removed and logged to the
    INVALID files; urls that respond but whose page title looks like an
    error page are removed and logged to the ERRORS files.  Every
    surviving url is replaced by a dict holding its English title,
    structure score, abstract flag, keyword rating and linked pdfs.

    Mutates *data* in place and returns it.
    """
    # links whose request raised (dead hosts, bad urls, timeouts, ...)
    invalid_links = []
    # links that answered but served an error page
    error_pages = []
    for i, el in enumerate(data.keys()):
        urls = data[el]['Urls']
        data[el]['Urls'] = {}
        for url in urls:
            try:
                # timeout keeps one unresponsive server from hanging the
                # whole run; such urls land in invalid_links instead
                html_doc = requests.get(url, timeout=30).text
                soup = BeautifulSoup(html_doc, 'html.parser')
                title = soup.title.text.strip()
                if check_error_page(title):
                    error_pages.append([el, data[el]['Title'], url, title])
                else:
                    data[el]['Urls'][url] = {'En_title': title, 'Abstract': None,
                                             'Structure': 0, 'Keyword': 0, 'Pdfs': []}
                    structure_info = check_structure(soup)
                    data[el]['Urls'][url]['Structure'] = structure_info[0]
                    data[el]['Urls'][url]['Abstract'] = structure_info[1]
                    data[el]['Urls'][url]['Keyword'] = count_keywords(data[el]['Keywords'], title)
                    data[el]['Urls'][url]['Pdfs'] = check_pdf(soup)
            except Exception:
                # narrowed from a bare except so Ctrl-C / SystemExit still
                # interrupt the run; anything else marks the link invalid
                invalid_links.append([el, data[el]['Title'], url])
        if i % 100 == 0:
            print(str(i) + ' articles processed')
    write_errors(ERRORS, error_pages, 4)
    write_errors(INVALID, invalid_links, 3)
    errors_to_csv(ERRORS, error_pages)
    errors_to_csv(INVALID, invalid_links)

    return data
   
def check_error_page(title):
    """
    Return True when a page title looks like an HTTP error page.

    Matches a list of well-known error phrases (case-insensitive) and,
    failing that, the module-level pattern p2 (standalone 403/404/410
    status codes).
    """
    phrases = ('not found', 'cannot be found', 'access denied',
               'twitter / ?', 'page unavailable', 'redirecting',
               'not be retrieved', 'bad title', 'article expired',
               'error page', 'server error', '500 error')
    lowered = title.lower()
    if any(phrase in lowered for phrase in phrases):
        return True
    return bool(p2.search(title))

def check_structure(soup):
    """
    Score how much a parsed page looks like a scientific article.

    Collects the text of all h1-h6 headings and counts how many of the
    canonical section names (Abstract, Introduction, Results,
    Discussion, References, Acknowledgements) occur in them as a
    case-insensitive substring.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed html document.

    Returns
    -------
    tuple (count, has_abstract)
        count: number of canonical section names found (0-6);
        has_abstract: True when an 'abstract' heading was seen.
    """
    headings = []
    for level in range(1, 7):
        # '-N-' level markers, kept from the original joined-text format
        headings.append('-%d-' % level)
        headings.extend(h.text for h in soup.find_all('h%d' % level))
    # join with real whitespace — the old code concatenated the literal
    # two characters backslash-s ('\s') between headings by mistake
    head_lower = (' ' + ' '.join(headings)).lower()
    sections = ('Abstract', 'Introduction', 'Results', 'Discussion',
                'References', 'Acknowledgements')
    count = sum(1 for word in sections if word.lower() in head_lower)
    return (count, 'abstract' in head_lower)

def count_keywords(keywords, title):
    """
    Rate how many of an article's keywords reappear in *title*.

    Parameters
    ----------
    keywords : str or None
        Comma-separated (German) keyword string.
    title : str
        English page title to search in.

    Returns
    -------
    str
        '0' when no keyword matches (or none are given), otherwise
        'matched/total' over the translatable keywords, e.g. '2/5'.

    Uses the module-level keyword_dict (German -> English) that is
    loaded in __main__.
    """
    if keywords is None:
        return str(0)
    words = [w.strip() for w in keywords.split(',') if w != ' ' and len(w) != 0]
    en_keywords = [keyword_dict[w.lower()] for w in words
                   if w.lower() in keyword_dict.keys()]
    hits = 0
    for keyword in en_keywords:
        if keyword.lower() in title.lower():
            hits += 1
    # Bug fix: the rating used to be computed inside the loop, so an
    # empty en_keywords list left key_rating unbound (UnboundLocalError).
    if hits == 0:
        return str(0)
    return str(hits) + '/' + str(len(en_keywords))
    
def check_pdf(soup):
    """
    Collect the distinct pdf links on a page.

    Returns the de-duplicated list of href attributes (from <a> tags)
    that contain '.pdf'; empty list when the page has no usable links.
    """
    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
    hrefs = [href for href in hrefs if href is not None]
    if not hrefs:
        return []
    return list({href for href in hrefs if '.pdf' in href})

def rmv_links_witout_url(data):
    """
    Return a copy of *data* keeping only articles that still have urls.
    """
    return {article_id: record
            for article_id, record in data.items()
            if len(record['Urls']) != 0}

if __name__ == '__main__':
    # German -> English keyword translations; module-level global that
    # count_keywords() reads — do not move into a function
    with open(KEYWORDS, 'r') as f:
        keyword_dict = json.load(f)
    #data = open_json(INPUT+'Spektrum_Links_export_20190304_142037_Sample.json')
    """
    """ 
    # merge every json export in the input directory into one dict,
    # keyed by article id (later files overwrite duplicate ids)
    spektrum_files = os.listdir(INPUT)
    data = {}
    for file in spektrum_files:
        data.update(open_json(INPUT+file))
    
    # drop German / social-media links, then request and classify the rest
    data = filter_urls(data)
    data = request_urls(data)
    
    # persist the full (possibly empty-url) result before reporting
    with open(OUTPUT,'w') as f:
        json.dump(data, f)
        
    print(len(data), ' articles in original data')
    # articles whose urls were all filtered out are excluded from the count
    data = rmv_links_witout_url(data)
    print(len(data), ' articles in filtered data')