Loading allzweckmesser/__init__.py +1 −1 Original line number Diff line number Diff line from . import config, db, meters, model, scan, scanner, wordlist from . import config, corpus, db, meters, model, scan, scanner, wordlist allzweckmesser/corpus.py +55 −17 Original line number Diff line number Diff line Loading @@ -6,6 +6,19 @@ from bs4 import BeautifulSoup from .model import Reading, Syllable, Token BASE_HTML = """<!DOCTYPE html PUBLIC"-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta content="text/html; charset=utf-8" http-equiv="Content-type" > <title>Plautus Amphitruo</title> </head> <body> </body> </html> """ class HypotacticLine: Loading @@ -14,7 +27,7 @@ class HypotacticLine: tokens = [] span_begin = 0 idx = 0 for token_tag in element.children: for token_tag in element.find_all(name='span', class_='word'): syllables = [] token_text = token_tag.text token = Token( Loading Loading @@ -56,21 +69,25 @@ class HypotacticDocument: def __init__(self, file_path, parser='lxml'): with open(file_path) as f: try: self.root = BeautifulSoup(f, parser) self.title = self.root.title except Exception as e: print('Exception {!r} when parsing file {!r}' .format(e, file_path)) self.title = None def get_poems(self, filters=()): def get_poems(self, filters=tuple()): yield from ( p for p in self.root.find_all(name='div', class_='poem') if all(fil(p) for fil in filters) poem for poem in self.root.find_all(name='div', class_='poem') if all(fil(poem) for fil in filters) ) def get_lines(self, line_filters=(), poem_filters=()): def get_lines(self, line_filters=tuple()): yield from ( line for poem in self.get_poems(poem_filters) for line in poem.find_all(name='div', class_='line') for line in self.root.find_all(name='div', class_='line') if all(fil(line) for fil in line_filters) ) Loading @@ -89,16 +106,37 @@ class HypotacticCorpus: for basename in os.listdir(directory)] return cls(file_paths, *args, **kwargs) def get_poems(self, filters=()): def get_poems(self, filters=tuple()): yield from ( p poem for doc in self.documents for p in doc.get_poems(filters) for poem in doc.get_poems(filters) ) def get_lines(self, line_filters=(), poem_filters=()): def get_lines(self, line_filters=tuple()): yield from ( p line for doc in self.documents for p in doc.get_lines(line_filters, poem_filters) for line in doc.get_lines(line_filters) ) def get_lines_with_meter(self, meters): filters = [lambda line: any((meter in line.attrs['class']) for meter in meters)] yield from self.get_lines(filters) def save_lines(self, file_handle, lines, title='Saved Poems', base_html=BASE_HTML): soup = BeautifulSoup(base_html, self.parser) title_tag = soup.new_tag('title') title_tag.string = title soup.find(name='head').append(title_tag) latin = soup.new_tag('div') latin.attrs['class'] = 'latin' for line in lines: latin.append(line) soup.find(name='body').append(latin) file_handle.write(soup.prettify()) Loading
allzweckmesser/__init__.py +1 −1 Original line number Diff line number Diff line from . import config, db, meters, model, scan, scanner, wordlist from . import config, corpus, db, meters, model, scan, scanner, wordlist
allzweckmesser/corpus.py +55 −17 Original line number Diff line number Diff line Loading @@ -6,6 +6,19 @@ from bs4 import BeautifulSoup from .model import Reading, Syllable, Token BASE_HTML = """<!DOCTYPE html PUBLIC"-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <meta content="text/html; charset=utf-8" http-equiv="Content-type" > <title>Plautus Amphitruo</title> </head> <body> </body> </html> """ class HypotacticLine: Loading @@ -14,7 +27,7 @@ class HypotacticLine: tokens = [] span_begin = 0 idx = 0 for token_tag in element.children: for token_tag in element.find_all(name='span', class_='word'): syllables = [] token_text = token_tag.text token = Token( Loading Loading @@ -56,21 +69,25 @@ class HypotacticDocument: def __init__(self, file_path, parser='lxml'): with open(file_path) as f: try: self.root = BeautifulSoup(f, parser) self.title = self.root.title except Exception as e: print('Exception {!r} when parsing file {!r}' .format(e, file_path)) self.title = None def get_poems(self, filters=()): def get_poems(self, filters=tuple()): yield from ( p for p in self.root.find_all(name='div', class_='poem') if all(fil(p) for fil in filters) poem for poem in self.root.find_all(name='div', class_='poem') if all(fil(poem) for fil in filters) ) def get_lines(self, line_filters=(), poem_filters=()): def get_lines(self, line_filters=tuple()): yield from ( line for poem in self.get_poems(poem_filters) for line in poem.find_all(name='div', class_='line') for line in self.root.find_all(name='div', class_='line') if all(fil(line) for fil in line_filters) ) Loading @@ -89,16 +106,37 @@ class HypotacticCorpus: for basename in os.listdir(directory)] return cls(file_paths, *args, **kwargs) def get_poems(self, filters=()): def get_poems(self, filters=tuple()): yield from ( p poem for doc in self.documents for p in doc.get_poems(filters) for poem in doc.get_poems(filters) ) def get_lines(self, line_filters=(), poem_filters=()): def get_lines(self, line_filters=tuple()): yield from ( p line for doc in self.documents for p in doc.get_lines(line_filters, poem_filters) for line in doc.get_lines(line_filters) ) def get_lines_with_meter(self, meters): filters = [lambda line: any((meter in line.attrs['class']) for meter in meters)] yield from self.get_lines(filters) def save_lines(self, file_handle, lines, title='Saved Poems', base_html=BASE_HTML): soup = BeautifulSoup(base_html, self.parser) title_tag = soup.new_tag('title') title_tag.string = title soup.find(name='head').append(title_tag) latin = soup.new_tag('div') latin.attrs['class'] = 'latin' for line in lines: latin.append(line) soup.find(name='body').append(latin) file_handle.write(soup.prettify())