Commit 79644a03 authored by Simon Will
Browse files

Fix various things in corpus.py

parent 1be725b2
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
from . import config, db, meters, model, scan, scanner, wordlist
from . import config, corpus, db, meters, model, scan, scanner, wordlist
+55 −17
Original line number Diff line number Diff line
@@ -6,6 +6,19 @@ from bs4 import BeautifulSoup

from .model import Reading, Syllable, Token

# Skeleton XHTML document; the default ``base_html`` of
# ``HypotacticCorpus.save_lines``.  The DOCTYPE needs whitespace between
# ``PUBLIC`` and the public identifier, and empty elements are self-closed
# because the document declares itself as XHTML 1.0 Strict.
BASE_HTML = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-type" />
<title>Plautus Amphitruo</title>
</head>
<body>
</body>
</html>
"""


class HypotacticLine:

@@ -14,7 +27,7 @@ class HypotacticLine:
        tokens = []
        span_begin = 0
        idx = 0
        for token_tag in element.children:
        for token_tag in element.find_all(name='span', class_='word'):
            syllables = []
            token_text = token_tag.text
            token = Token(
@@ -56,21 +69,25 @@ class HypotacticDocument:

    def __init__(self, file_path, parser='lxml'):
        """Parse the file at ``file_path`` into a BeautifulSoup tree.

        Parsing is best effort: if BeautifulSoup raises, the error is
        printed and the document is left empty instead of propagating the
        exception.

        :param file_path: path of the file to open and parse.
        :param parser: name of the parser handed to ``BeautifulSoup``.

        After construction ``self.root`` is always a ``BeautifulSoup``
        object and ``self.title`` is the ``<title>`` tag or ``None``.
        """
        with open(file_path) as f:
            try:
                self.root = BeautifulSoup(f, parser)
                self.title = self.root.title
            except Exception as e:
                print('Exception {!r} when parsing file {!r}'
                      .format(e, file_path))
                # Fall back to an empty tree so ``self.root`` always exists
                # and ``self.root.find_all(...)`` simply finds nothing.
                # 'html.parser' is used here because it needs no optional
                # dependency, so the fallback cannot fail the way the
                # requested parser may have.
                self.root = BeautifulSoup('', 'html.parser')
                self.title = None

    def get_poems(self, filters=()):
    def get_poems(self, filters=tuple()):
        yield from (
            p
            for p in self.root.find_all(name='div', class_='poem')
            if all(fil(p) for fil in filters)
            poem
            for poem in self.root.find_all(name='div', class_='poem')
            if all(fil(poem) for fil in filters)
        )

    def get_lines(self, line_filters=(), poem_filters=()):
    def get_lines(self, line_filters=tuple()):
        yield from (
            line
            for poem in self.get_poems(poem_filters)
            for line in poem.find_all(name='div', class_='line')
            for line in self.root.find_all(name='div', class_='line')
            if all(fil(line) for fil in line_filters)
        )

@@ -89,16 +106,37 @@ class HypotacticCorpus:
                      for basename in os.listdir(directory)]
        return cls(file_paths, *args, **kwargs)

    def get_poems(self, filters=()):
    def get_poems(self, filters=tuple()):
        yield from (
            p
            poem
            for doc in self.documents
            for p in doc.get_poems(filters)
            for poem in doc.get_poems(filters)
        )

    def get_lines(self, line_filters=(), poem_filters=()):
    def get_lines(self, line_filters=tuple()):
        yield from (
            p
            line
            for doc in self.documents
            for p in doc.get_lines(line_filters, poem_filters)
            for line in doc.get_lines(line_filters)
        )

    def get_lines_with_meter(self, meters):
        """Yield the lines whose ``class`` attribute contains a given meter.

        :param meters: iterable of meter names; a line is yielded when at
            least one of them is contained in ``line.attrs['class']``.
        :return: generator over the matching lines from ``self.get_lines``.
        """
        # The filter runs once per line, so the names are materialised up
        # front; a one-shot iterator would otherwise be exhausted after the
        # first line and every later line would be rejected.
        wanted = tuple(meters)

        def has_wanted_meter(line):
            classes = line.attrs['class']
            return any(meter in classes for meter in wanted)

        yield from self.get_lines([has_wanted_meter])

    def save_lines(self, file_handle, lines, title='Saved Poems',
                   base_html=BASE_HTML):
        """Write ``lines`` into a copy of ``base_html`` and save the result.

        The lines are collected in a ``<div class="latin">`` that is
        appended to the ``<body>`` of the parsed ``base_html``; the
        prettified document is then written to ``file_handle``.

        :param file_handle: object with a ``write`` method taking a string.
        :param lines: iterable of elements to place in the document.
        :param title: text of the document's ``<title>``.
        :param base_html: markup of the skeleton document; it has to
            contain a ``<head>`` and a ``<body>``.
        """
        import copy

        soup = BeautifulSoup(base_html, self.parser)

        # Reuse a title already present in the skeleton so the output never
        # carries two <title> elements; create one only when it is missing.
        head = soup.find(name='head')
        title_tag = head.find(name='title')
        if title_tag is None:
            title_tag = soup.new_tag('title')
            head.append(title_tag)
        title_tag.string = title

        latin = soup.new_tag('div')
        latin.attrs['class'] = 'latin'
        for line in lines:
            # Appending an element that already belongs to a tree moves it
            # there, which would strip the line from its source document;
            # append a detached copy instead.
            latin.append(copy.copy(line))
        soup.find(name='body').append(latin)

        file_handle.write(soup.prettify())