Commit 3574bdf4 authored by Simon Will's avatar Simon Will
Browse files

Improve Hypotactic reading and add tests

parent 0bd0a301
Loading
Loading
Loading
Loading
+97 −41
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-

import logging
import re
import os.path

from bs4 import BeautifulSoup
@@ -19,10 +21,7 @@ BASE_HTML = """<!DOCTYPE html PUBLIC"-//W3C//DTD XHTML 1.0 Strict//EN"
"""


class HypotacticLine:

    def __init__(self, element):
        self.element = element
def get_reading_from_line_element(element):
    tokens = []
    span_begin = 0
    idx = 0
@@ -31,10 +30,10 @@ class HypotacticLine:
        token_text = token_tag.text
        token = Token(
            token=token_text,
                span=(span_begin, span_begin + len(token_text))
            span=[span_begin, span_begin + len(token_text)]
        )

            for syllable_tag in token_tag.children:
        for syllable_tag in token_tag.find_all(name='span', class_='syll'):
            syllable_text = syllable_tag.text
            if 'long' in syllable_tag.attrs['class']:
                syllable_length = 2
@@ -50,18 +49,75 @@ class HypotacticLine:
            syllable = Syllable(
                idx=idx,
                syllable=syllable_text,
                    span=(span_begin, span_begin + len(syllable_text)),
                span=[span_begin, span_begin + len(syllable_text)],
                syllable_length=syllable_length,
                vowel_length=None
            )
            idx += 1
            syllables.append(syllable)
            span_begin += len(syllable_text)
        # The + 1 is for simulating a space between tokens.
        span_begin += 1

        token.syllables = syllables
        tokens.append(token)

        self.reading = Reading(tokens=tokens)
    return Reading(tokens=tokens)


def separate_punctuation(tokens):
    """Split leading and trailing punctuation off every token.

    Each punctuation character found before or after the word core of a
    token becomes a one-character ``Token`` of its own, inserted into
    ``tokens`` directly before or after the token it was attached to.
    The original token's ``text`` and ``span`` are narrowed to the word
    core, and the ``text`` and ``span`` of its first and last syllable
    are trimmed accordingly.

    A token whose text does not have the shape
    punctuation-word-punctuation (e.g. punctuation inside the word) is
    left untouched and a warning is logged.

    Arguments:
        tokens: list of tokens having ``text``, ``span`` (a mutable
            ``[begin, end]`` list) and ``syllables`` attributes.  The
            list and its tokens are modified in place.

    Returns:
        The same ``tokens`` list that was passed in.
    """
    # Compiled once instead of being looked up on every iteration.
    pattern = re.compile(
        r'^(?P<pre_punct>[\W_]*)(?P<non_punct>\w*)(?P<post_punct>[\W_]*)$'
    )

    i = 0
    while i < len(tokens):
        token = tokens[i]
        m = pattern.match(token.text)
        if m is None:
            logging.warning('{!r} does not match the punctuation regex.'
                            .format(token))
            i += 1
            continue

        pre = m.group('pre_punct')
        post = m.group('post_punct')
        core_begin = token.span[0] + m.start('non_punct')
        core_end = token.span[1] - len(post)

        # Leading punctuation goes in front of the token.  Advancing i
        # after each insert keeps it pointing at the original token.
        for offset, char in enumerate(pre):
            begin = token.span[0] + offset
            tokens.insert(i, Token(char, [begin, begin + 1]))
            i += 1

        # Trailing punctuation goes behind the token.  Advancing i after
        # each insert keeps the characters in their original order and
        # leaves i on the last inserted punctuation token.
        for offset, char in enumerate(post):
            begin = core_end + offset
            tokens.insert(i + 1, Token(char, [begin, begin + 1]))
            i += 1

        # Narrow the original token and its outermost syllables to the
        # word core.
        token.text = m.group('non_punct')
        token.span = [core_begin, core_end]
        # Tokens without syllables have nothing to trim; guarding here
        # avoids an IndexError on syllables[0] / syllables[-1].
        if token.syllables:
            if pre:
                first = token.syllables[0]
                first.text = first.text[len(pre):]
                first.span[0] = core_begin
            if post:
                last = token.syllables[-1]
                last.text = last.text[:-len(post)]
                last.span[1] = core_end

        i += 1

    return tokens


class HypotacticLine:
    """A verse line built from a markup element.

    Attributes:
        element: the element the line was created from.
        reading: the reading returned by get_reading_from_line_element
            for that element, with its tokens passed through
            separate_punctuation.
    """

    def __init__(self, element):
        self.element = element
        self.reading = get_reading_from_line_element(element)
        # The reading lives on the instance; a bare ``reading`` name does
        # not exist in this scope and would raise NameError.
        self.reading.tokens = separate_punctuation(self.reading.tokens)


class HypotacticDocument:
@@ -138,7 +194,7 @@ class HypotacticCorpus:
            for line in doc.get_lines_with_meter(meters)
        )

    def save_lines(self, file_handle, lines, title='Saved Poems',
    def save_html_tags(self, file_handle, tags, title='Saved Poems',
                   base_html=BASE_HTML, pretty=False):
        soup = BeautifulSoup(base_html, self.parser)

@@ -148,8 +204,8 @@ class HypotacticCorpus:

        latin = soup.new_tag('div')
        latin.attrs['class'] = 'latin'
        for line in lines:
            latin.append(line)
        for tag in tags:
            latin.append(tag)
        soup.find(name='body').append(latin)

        if pretty:
+1 −1
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ def main(hypotactic_dir, outfile, limit=0, title='Lines', meters=tuple()):
    if limit:
        line_generator = itertools.islice(line_generator, limit)
    with open(outfile, 'w') as f:
        corpus.save_lines(f, line_generator, title=title)
        corpus.save_html_tags(f, line_generator, title=title)


def parse_args_and_main():

tests/test_corpus.py

0 → 100644
+106 −0
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import pytest

import allzweckmesser as azm


@pytest.fixture
def aratea_element():
    """Return the parsed markup of a single syllabified verse line.

    The markup encodes the line
    'vertitur: [hanc nemo certo tibi dicere possit,'
    as one 'word' span per word, each holding one 'syll' span per
    syllable that is additionally classed 'long' or 'short'.
    """
    markup = (
        '<div class="line"><span class="word"><span class="syll'
        ' long">ver</span><span class="syll short">ti</span><span'
        ' class="syll short">tur:</span></span><span class="word"><span'
        ' class="syll long">[hanc</span></span><span class="word"><span'
        ' class="syll long">nē</span><span class="syll'
        ' long">mō</span></span><span class="word"><span class="syll'
        ' long">cer</span><span class="syll long">tō</span></span><span'
        ' class="word"><span class="syll short">ti</span><span class="syll'
        ' short">bi</span></span><span class="word"><span class="syll'
        ' long">dī</span><span class="syll short">ce</span><span class="syll'
        ' short">re</span></span><span class="word"><span class="syll'
        ' long">pos</span><span class="syll long">sit,</span></span></div>'
    )
    return BeautifulSoup(markup, 'lxml')


def test_get_reading_from_line_element(aratea_element):
    """Check tokens, spans and syllables read from the fixture line.

    Punctuation is still attached to the tokens at this stage, and token
    spans leave a one-character gap between consecutive tokens.
    """
    reading = azm.corpus.get_reading_from_line_element(aratea_element)
    tokens = reading.tokens

    assert len(tokens) == 7

    assert [token.text for token in tokens] == [
        'vertitur:',
        '[hanc',
        'nēmō',
        'certō',
        'tibi',
        'dīcere',
        'possit,',
    ]

    assert [token.span for token in tokens] == [
        [0, 9],
        [10, 15],
        [16, 20],
        [21, 26],
        [27, 31],
        [32, 38],
        [39, 46],
    ]

    first_syllables = tokens[0].syllables
    assert len(first_syllables) == 3
    assert [syllable.text for syllable in first_syllables] == [
        'ver',
        'ti',
        'tur:',
    ]


def test_separate_punctuation(aratea_element):
    """Check that punctuation is split into tokens of its own.

    The fixture line has ':' after the first word, '[' before the second
    and ',' after the last, so the seven raw tokens become ten.  Tokens
    and syllables that lose punctuation must have their text and span
    narrowed; all others must be unchanged.
    """
    reading = azm.corpus.get_reading_from_line_element(aratea_element)
    reading.tokens = azm.corpus.separate_punctuation(reading.tokens)
    tokens = reading.tokens

    assert len(tokens) == 10

    assert [token.text for token in tokens] == [
        'vertitur',
        ':',
        '[',
        'hanc',
        'nēmō',
        'certō',
        'tibi',
        'dīcere',
        'possit',
        ',',
    ]

    assert [token.span for token in tokens] == [
        [0, 8],
        [8, 9],
        [10, 11],
        [11, 15],
        [16, 20],
        [21, 26],
        [27, 31],
        [32, 38],
        [39, 45],
        [45, 46],
    ]

    # Trailing ':' is removed from the last syllable of 'vertitur'.
    assert len(tokens[0].syllables) == 3
    assert tokens[0].syllables[0].text == 'ver'
    assert tokens[0].syllables[0].span == [0, 3]
    assert tokens[0].syllables[1].text == 'ti'
    assert tokens[0].syllables[1].span == [3, 5]
    assert tokens[0].syllables[2].text == 'tur'
    assert tokens[0].syllables[2].span == [5, 8]

    # Punctuation tokens carry no syllables.
    assert tokens[1].syllables == []
    assert tokens[2].syllables == []

    # Leading '[' is removed from the only syllable of 'hanc'.
    assert len(tokens[3].syllables) == 1
    assert tokens[3].syllables[0].text == 'hanc'
    assert tokens[3].syllables[0].span == [11, 15]

    # A token without punctuation keeps its syllables untouched.
    assert len(tokens[4].syllables) == 2
    assert tokens[4].syllables[0].text == 'nē'
    assert tokens[4].syllables[0].span == [16, 18]
    assert tokens[4].syllables[1].text == 'mō'
    assert tokens[4].syllables[1].span == [18, 20]