Commit a12766b4 authored by nwarslan's avatar nwarslan
Browse files

added filter_data.py

parent f1943575
Loading
Loading
Loading
Loading

code/filter_data.py

0 → 100644
+225 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 13 11:50:27 2019

@author: arslanna
"""
import json
from bs4 import BeautifulSoup
import requests
import re
from pandas import DataFrame
import os

# Input directory with the exported spektrum link json files.
INPUT = '../data/spektrum_links/json/'
# Filtered result plus the various reject/report file stems (each gets
# both a .txt and a .csv extension appended by the writer functions).
OUTPUT = '../output/spektrum_links_output/filtered_Spektrum_Links.json'
WIKI = '../output/spektrum_links_output/wiki_links'
ERRORS = '../output/spektrum_links_output/error_links'
INVALID = '../output/spektrum_links_output/invalid_links'
SM = '../output/spektrum_links_output/social_media_links'
# German -> English keyword translation table (loaded in __main__).
KEYWORDS = '../data/spektrum_keywords/spektrum_keyword_dict.json'

# Citation-style markers like '[1.2] ' — appears unused in this module.
# Raw strings fix the invalid escape sequences ('\[', '\s', ...) that the
# old plain string literals produced warnings for on modern Python.
p1 = re.compile(r'\[[0-9]+\.[0-9]+\]\s?')
# Standalone 403/404/410 status codes in page titles (used by check_error_page).
p2 = re.compile(r'(^|\s|\()(404|403|410)(\s|$|:|-|\))')

def open_json(filename):
    """
    Load a json export of spektrum links and re-key it by article id.

    The input maps column names to row dicts:
        {'Id': {...}, 'Date': {...}, 'Title': {...}, 'Keywords': {...},
         'Source': {...}, 'Urls': {...}}
    The result maps each article id to its own record:
        {id: {'Title': ..., 'Keywords': ..., 'Urls': ...}}
    """
    with open(filename, 'r') as fh:
        raw = json.load(fh)
    reshaped = {}
    for row, article_id in raw['Id'].items():
        reshaped[article_id] = {
            'Title': raw['Title'][row],
            'Keywords': raw['Keywords'][row],
            'Urls': raw['Urls'][row],
        }
    return reshaped

def write_errors(filename, data, i):
    """
    Write the first *i* fields of each record to ``filename + '.txt'``.

    Parameters
    ----------
    filename : str
        Output path stem; '.txt' is appended.
    data : iterable of sequences of str
        Each record must have at least *i* fields.
    i : int
        Number of leading fields written per record.

    Each record becomes one line of tab-separated fields.  The trailing
    tab before the newline matches the historical output format, so
    existing consumers still parse the files.
    """
    with open(filename + '.txt', 'w') as f:
        for record in data:
            # '\t'.join replaces the old quadratic += loop; f.write
            # replaces a misuse of writelines() on a single string
            f.write('\t'.join(record[:i]) + '\t\n')
            
def errors_to_csv(filename, data):
    """
    Dump error records to ``filename + '.csv'`` with Id/Title/Urls columns.

    data: iterable of records, each with at least three fields
    (id, title, url).  No index column is written.
    """
    rows = list(data)
    table = {
        'Id': {idx: rec[0] for idx, rec in enumerate(rows)},
        'Title': {idx: rec[1] for idx, rec in enumerate(rows)},
        'Urls': {idx: rec[2] for idx, rec in enumerate(rows)},
    }
    frame = DataFrame(table, columns=['Id', 'Title', 'Urls'])
    frame.to_csv(filename + '.csv', index=None, header=True)
           
def filter_urls(data):
    """
    Drop German and social-media links from every article's url list.

    Links to the German wikipedia are logged to the WIKI files, links to
    twitter/facebook/youtube to the SM files (each as .txt and .csv).
    Urls containing '.de/' or ending in '.de' are dropped silently.
    Mutates *data* in place and returns it.
    """
    wiki_de = []   # [id, title, url] for German wikipedia links
    sm_links = []  # [id, title, url] for social media links
    social_hosts = ('twitter.com', 'facebook.com', 'youtube.com')
    drop_marks = ('de.wikipedia.org',) + social_hosts
    for article_id, article in data.items():
        for url in article['Urls']:
            if 'de.wikipedia' in url:
                wiki_de.append([article_id, article['Title'], url])
            elif any(host in url for host in social_hosts):
                sm_links.append([article_id, article['Title'], url])
        article['Urls'] = [
            url for url in article['Urls']
            if '.de/' not in url
            and not url.endswith('.de')
            and not any(mark in url for mark in drop_marks)
        ]
    write_errors(WIKI, wiki_de, 3)
    write_errors(SM, sm_links, 3)
    errors_to_csv(WIKI, wiki_de)
    errors_to_csv(SM, sm_links)
    return data

def request_urls(data):
    """
    Request every remaining url of every article.

    Urls whose request (or parsing) raises are removed and logged to the
    INVALID files; urls that respond but whose page title looks like an
    error page are removed and logged to the ERRORS files.  Every
    surviving url is replaced by a dict holding its English title,
    structure score, abstract flag, keyword rating and linked pdfs.

    Mutates *data* in place and returns it.
    """
    # links whose request raised (dead hosts, bad urls, timeouts, ...)
    invalid_links = []
    # links that answered but served an error page
    error_pages = []
    for i, el in enumerate(data.keys()):
        urls = data[el]['Urls']
        data[el]['Urls'] = {}
        for url in urls:
            try:
                # timeout keeps one unresponsive server from hanging the
                # whole run; such urls land in invalid_links instead
                html_doc = requests.get(url, timeout=30).text
                soup = BeautifulSoup(html_doc, 'html.parser')
                title = soup.title.text.strip()
                if check_error_page(title):
                    error_pages.append([el, data[el]['Title'], url, title])
                else:
                    data[el]['Urls'][url] = {'En_title': title, 'Abstract': None,
                                             'Structure': 0, 'Keyword': 0, 'Pdfs': []}
                    structure_info = check_structure(soup)
                    data[el]['Urls'][url]['Structure'] = structure_info[0]
                    data[el]['Urls'][url]['Abstract'] = structure_info[1]
                    data[el]['Urls'][url]['Keyword'] = count_keywords(data[el]['Keywords'], title)
                    data[el]['Urls'][url]['Pdfs'] = check_pdf(soup)
            except Exception:
                # narrowed from a bare except so Ctrl-C / SystemExit still
                # interrupt the run; anything else marks the link invalid
                invalid_links.append([el, data[el]['Title'], url])
        if i % 100 == 0:
            print(str(i) + ' articles processed')
    write_errors(ERRORS, error_pages, 4)
    write_errors(INVALID, invalid_links, 3)
    errors_to_csv(ERRORS, error_pages)
    errors_to_csv(INVALID, invalid_links)

    return data
   
def check_error_page(title):
    """
    Return True when a page title looks like an HTTP error page.

    Matches a list of well-known error phrases (case-insensitive) and,
    failing that, the module-level pattern p2 (standalone 403/404/410
    status codes).
    """
    phrases = ('not found', 'cannot be found', 'access denied',
               'twitter / ?', 'page unavailable', 'redirecting',
               'not be retrieved', 'bad title', 'article expired',
               'error page', 'server error', '500 error')
    lowered = title.lower()
    if any(phrase in lowered for phrase in phrases):
        return True
    return bool(p2.search(title))

def check_structure(soup):
    """
    Score how much a parsed page looks like a scientific article.

    Collects the text of all h1-h6 headings and counts how many of the
    canonical section names (Abstract, Introduction, Results,
    Discussion, References, Acknowledgements) occur in them as a
    case-insensitive substring.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed html document.

    Returns
    -------
    tuple (count, has_abstract)
        count: number of canonical section names found (0-6);
        has_abstract: True when an 'abstract' heading was seen.
    """
    headings = []
    for level in range(1, 7):
        # '-N-' level markers, kept from the original joined-text format
        headings.append('-%d-' % level)
        headings.extend(h.text for h in soup.find_all('h%d' % level))
    # join with real whitespace — the old code concatenated the literal
    # two characters backslash-s ('\s') between headings by mistake
    head_lower = (' ' + ' '.join(headings)).lower()
    sections = ('Abstract', 'Introduction', 'Results', 'Discussion',
                'References', 'Acknowledgements')
    count = sum(1 for word in sections if word.lower() in head_lower)
    return (count, 'abstract' in head_lower)

def count_keywords(keywords, title):
    """
    Rate how many of an article's keywords reappear in *title*.

    Parameters
    ----------
    keywords : str or None
        Comma-separated (German) keyword string.
    title : str
        English page title to search in.

    Returns
    -------
    str
        '0' when no keyword matches (or none are given), otherwise
        'matched/total' over the translatable keywords, e.g. '2/5'.

    Uses the module-level keyword_dict (German -> English) that is
    loaded in __main__.
    """
    if keywords is None:
        return str(0)
    words = [w.strip() for w in keywords.split(',') if w != ' ' and len(w) != 0]
    en_keywords = [keyword_dict[w.lower()] for w in words
                   if w.lower() in keyword_dict.keys()]
    hits = 0
    for keyword in en_keywords:
        if keyword.lower() in title.lower():
            hits += 1
    # Bug fix: the rating used to be computed inside the loop, so an
    # empty en_keywords list left key_rating unbound (UnboundLocalError).
    if hits == 0:
        return str(0)
    return str(hits) + '/' + str(len(en_keywords))
    
def check_pdf(soup):
    """
    Collect the distinct pdf links on a page.

    Returns the de-duplicated list of href attributes (from <a> tags)
    that contain '.pdf'; empty list when the page has no usable links.
    """
    hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
    hrefs = [href for href in hrefs if href is not None]
    if not hrefs:
        return []
    return list({href for href in hrefs if '.pdf' in href})

def rmv_links_witout_url(data):
    """
    Return a copy of *data* keeping only articles that still have urls.
    """
    return {article_id: record
            for article_id, record in data.items()
            if len(record['Urls']) != 0}

if __name__ == '__main__':
    # German -> English keyword translations; module-level global that
    # count_keywords() reads — do not move into a function
    with open(KEYWORDS, 'r') as f:
        keyword_dict = json.load(f)
    #data = open_json(INPUT+'Spektrum_Links_export_20190304_142037_Sample.json')
    """
    """ 
    # merge every json export in the input directory into one dict,
    # keyed by article id (later files overwrite duplicate ids)
    spektrum_files = os.listdir(INPUT)
    data = {}
    for file in spektrum_files:
        data.update(open_json(INPUT+file))
    
    # drop German / social-media links, then request and classify the rest
    data = filter_urls(data)
    data = request_urls(data)
    
    # persist the full (possibly empty-url) result before reporting
    with open(OUTPUT,'w') as f:
        json.dump(data, f)
        
    print(len(data), ' articles in original data')
    # articles whose urls were all filtered out are excluded from the count
    data = rmv_links_witout_url(data)
    print(len(data), ' articles in filtered data')