Fix various things in corpus.py (79644a03) · Commits · Messerschleifer / Allzweckmesser

allzweckmesser/__init__.py

+1 −1

Original line number	Diff line number	Diff line
		from . import config, db, meters, model, scan, scanner, wordlist
		from . import config, corpus, db, meters, model, scan, scanner, wordlist

allzweckmesser/corpus.py

+55 −17

Original line number	Diff line number	Diff line
		@@ -6,6 +6,19 @@ from bs4 import BeautifulSoup

		from .model import Reading, Syllable, Token

		# Skeleton XHTML document used as the default template by
		# HypotacticCorpus.save_lines(). It intentionally contains no <title>:
		# save_lines() inserts the title it is given, and a hard-coded one here
		# would end up as a second, stale <title> in every saved file.
		# The DOCTYPE needs whitespace between PUBLIC and the public identifier.
		BASE_HTML = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
		"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

		<html xmlns="http://www.w3.org/1999/xhtml">
		<head>
		<meta content="text/html; charset=utf-8" http-equiv="Content-type" >
		</head>
		<body>
		</body>
		</html>
		"""


		class HypotacticLine:

		@@ -14,7 +27,7 @@ class HypotacticLine:
		tokens = []
		span_begin = 0
		idx = 0
		for token_tag in element.children:
		for token_tag in element.find_all(name='span', class_='word'):
		syllables = []
		token_text = token_tag.text
		token = Token(
		@@ -56,21 +69,25 @@ class HypotacticDocument:

		def __init__(self, file_path, parser='lxml'):
		    """Parse the HTML document stored at *file_path*.

		    Parsing is best-effort: if building the tree fails, the error is
		    printed and the instance is left holding an empty document, so
		    that later queries on ``self.root`` simply find nothing instead
		    of raising ``AttributeError`` on a missing attribute.

		    :param file_path: path of the file to read.
		    :param parser: name of the parser handed to ``BeautifulSoup``.
		    """
		    with open(file_path) as f:
		        try:
		            self.root = BeautifulSoup(f, parser)
		            self.title = self.root.title
		        except Exception as e:
		            print('Exception {!r} when parsing file {!r}'
		                  .format(e, file_path))
		            # Always define ``root``. The built-in 'html.parser' is
		            # used for the empty fallback because the failure may
		            # have been caused by the requested parser itself.
		            self.root = BeautifulSoup('', 'html.parser')
		            self.title = None

		def get_poems(self, filters=()):
		def get_poems(self, filters=tuple()):
		yield from (
		p
		for p in self.root.find_all(name='div', class_='poem')
		if all(fil(p) for fil in filters)
		poem
		for poem in self.root.find_all(name='div', class_='poem')
		if all(fil(poem) for fil in filters)
		)

		def get_lines(self, line_filters=(), poem_filters=()):
		def get_lines(self, line_filters=tuple()):
		yield from (
		line
		for poem in self.get_poems(poem_filters)
		for line in poem.find_all(name='div', class_='line')
		for line in self.root.find_all(name='div', class_='line')
		if all(fil(line) for fil in line_filters)
		)

		@@ -89,16 +106,37 @@ class HypotacticCorpus:
		for basename in os.listdir(directory)]
		return cls(file_paths, *args, **kwargs)

		def get_poems(self, filters=()):
		def get_poems(self, filters=tuple()):
		yield from (
		p
		poem
		for doc in self.documents
		for p in doc.get_poems(filters)
		for poem in doc.get_poems(filters)
		)

		def get_lines(self, line_filters=(), poem_filters=()):
		def get_lines(self, line_filters=tuple()):
		yield from (
		p
		line
		for doc in self.documents
		for p in doc.get_lines(line_filters, poem_filters)
		for line in doc.get_lines(line_filters)
		)

		def get_lines_with_meter(self, meters):
		    """Yield the lines whose ``class`` attribute contains one of *meters*.

		    :param meters: iterable of meter names. It is materialised once,
		        so a one-shot iterator or generator works for every line and
		        not just the first one checked.
		    :returns: generator over the matching lines, as produced by
		        ``self.get_lines``.
		    """
		    wanted = tuple(meters)

		    def has_wanted_meter(line):
		        # A line without a ``class`` attribute matches no meter.
		        classes = line.attrs.get('class', ())
		        return any(meter in classes for meter in wanted)

		    yield from self.get_lines([has_wanted_meter])

		def save_lines(self, file_handle, lines, title='Saved Poems',
		               base_html=BASE_HTML):
		    """Write *lines* into an HTML document and save it to *file_handle*.

		    The document is built from *base_html*. Its title is set to
		    *title*, and all lines are placed inside a single
		    ``<div class="latin">`` appended to the body.

		    :param file_handle: writable text file object receiving the
		        prettified HTML.
		    :param lines: iterable of parsed line tags to save.
		    :param title: text of the document's ``<title>``.
		    :param base_html: HTML skeleton; must contain ``<head>`` and
		        ``<body>`` elements.
		    """
		    import copy

		    # NOTE(review): ``self.parser`` is not assigned in any code
		    # visible here -- confirm the corpus stores its parser name.
		    soup = BeautifulSoup(base_html, self.parser)

		    # Reuse a title already present in the template instead of adding
		    # a second <title> element next to it.
		    title_tag = soup.find(name='title')
		    if title_tag is None:
		        title_tag = soup.new_tag('title')
		        soup.find(name='head').append(title_tag)
		    title_tag.string = title

		    latin = soup.new_tag('div')
		    latin.attrs['class'] = 'latin'
		    for line in lines:
		        # Appending a tag that already lives in another tree moves it
		        # there; append a copy so the source documents stay intact.
		        latin.append(copy.copy(line))
		    soup.find(name='body').append(latin)

		    file_handle.write(soup.prettify())