Loading allzweckmesser/corpus.py +97 −41 Original line number Diff line number Diff line # -*- coding: utf-8 -*- import logging import re import os.path from bs4 import BeautifulSoup Loading @@ -19,10 +21,7 @@ BASE_HTML = """<!DOCTYPE html PUBLIC"-//W3C//DTD XHTML 1.0 Strict//EN" """ class HypotacticLine: def __init__(self, element): self.element = element def get_reading_from_line_element(element): tokens = [] span_begin = 0 idx = 0 Loading @@ -31,10 +30,10 @@ class HypotacticLine: token_text = token_tag.text token = Token( token=token_text, span=(span_begin, span_begin + len(token_text)) span=[span_begin, span_begin + len(token_text)] ) for syllable_tag in token_tag.children: for syllable_tag in token_tag.find_all(name='span', class_='syll'): syllable_text = syllable_tag.text if 'long' in syllable_tag.attrs['class']: syllable_length = 2 Loading @@ -50,18 +49,75 @@ class HypotacticLine: syllable = Syllable( idx=idx, syllable=syllable_text, span=(span_begin, span_begin + len(syllable_text)), span=[span_begin, span_begin + len(syllable_text)], syllable_length=syllable_length, vowel_length=None ) idx += 1 syllables.append(syllable) span_begin += len(syllable_text) # The + 1 is for simulating a space between tokens. span_begin += 1 token.syllables = syllables tokens.append(token) self.reading = Reading(tokens=tokens) return Reading(tokens=tokens) def separate_punctuation(tokens): i = 0 while i < len(tokens): token = tokens[i] m = re.match(r'^(?P<pre_punct>[\W_]*)(?P<non_punct>\w*)' '(?P<post_punct>[\W_]*)$', token.text) if m: pre = m.group('pre_punct') post = m.group('post_punct') # Create tokens for the punctuation before a token. span_begin = token.span[0] for c in pre: tokens.insert(i, Token(c, [span_begin, span_begin + 1])) span_begin += 1 i += 1 # Create tokens for the punctuation after a token. span_begin = token.span[1] - len(post) for c in m.group('post_punct'): tokens.insert(i + 1, Token(c, [span_begin, span_begin + 1])) span_begin += 1 i += 1 # Remove the punctuation from the original token and # from its syllables. token.text = m.group('non_punct') span_begin = token.span[0] + m.start('non_punct') span_end = token.span[1] - len(post) token.span = [span_begin, span_end] if pre: token.syllables[0].text = token.syllables[0].text[len(pre):] token.syllables[0].span[0] = span_begin if post: token.syllables[-1].text = token.syllables[-1].text[:-len(post)] token.syllables[-1].span[1] = span_end else: logging.warn('{!r} does not match the punctuation regex.' .format(token)) i += 1 return tokens class HypotacticLine: def __init__(self, element): self.element = element self.reading = get_reading_from_line_element(element) reading.tokens = separate_punctuation(reading.tokens) class HypotacticDocument: Loading Loading @@ -138,7 +194,7 @@ class HypotacticCorpus: for line in doc.get_lines_with_meter(meters) ) def save_lines(self, file_handle, lines, title='Saved Poems', def save_html_tags(self, file_handle, tags, title='Saved Poems', base_html=BASE_HTML, pretty=False): soup = BeautifulSoup(base_html, self.parser) Loading @@ -148,8 +204,8 @@ class HypotacticCorpus: latin = soup.new_tag('div') latin.attrs['class'] = 'latin' for line in lines: latin.append(line) for tag in tags: latin.append(tag) soup.find(name='body').append(latin) if pretty: Loading scripts/extract_lines.py +1 −1 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ def main(hypotactic_dir, outfile, limit=0, title='Lines', meters=tuple()): if limit: line_generator = itertools.islice(line_generator, limit) with open(outfile, 'w') as f: corpus.save_lines(f, line_generator, title=title) corpus.save_html_tags(f, line_generator, title=title) def parse_args_and_main(): Loading tests/test_corpus.py 0 → 100644 +106 −0 Original line number Diff line number Diff line # -*- coding: utf-8 -*- from bs4 import BeautifulSoup import pytest import allzweckmesser as azm @pytest.fixture def aratea_element(): # The line is: # 'vertitur: [hanc nemo certo tibi dicere possit,' return BeautifulSoup( '<div class="line"><span class="word"><span class="syll' ' long">ver</span><span class="syll short">ti</span><span' ' class="syll short">tur:</span></span><span class="word"><span' ' class="syll long">[hanc</span></span><span class="word"><span' ' class="syll long">nē</span><span class="syll' ' long">mō</span></span><span class="word"><span class="syll' ' long">cer</span><span class="syll long">tō</span></span><span' ' class="word"><span class="syll short">ti</span><span class="syll' ' short">bi</span></span><span class="word"><span class="syll' ' long">dī</span><span class="syll short">ce</span><span class="syll' ' short">re</span></span><span class="word"><span class="syll' ' long">pos</span><span class="syll long">sit,</span></span></div>', 'lxml' ) def test_get_reading_from_line_element(aratea_element): reading = azm.corpus.get_reading_from_line_element(aratea_element) verse = 'vertitur: [hanc nemo certo tibi dicere possit,' assert len(reading.tokens) == 7 assert reading.tokens[0].text == 'vertitur:' assert reading.tokens[1].text == '[hanc' assert reading.tokens[2].text == 'nēmō' assert reading.tokens[3].text == 'certō' assert reading.tokens[4].text == 'tibi' assert reading.tokens[5].text == 'dīcere' assert reading.tokens[6].text == 'possit,' assert reading.tokens[0].span == [0, 9] assert reading.tokens[1].span == [10, 15] assert reading.tokens[2].span == [16, 20] assert reading.tokens[3].span == [21, 26] assert reading.tokens[4].span == [27, 31] assert reading.tokens[5].span == [32, 38] assert reading.tokens[6].span == [39, 46] assert len(reading.tokens[0].syllables) == 3 assert reading.tokens[0].syllables[0].text == 'ver' assert reading.tokens[0].syllables[1].text == 'ti' assert reading.tokens[0].syllables[2].text == 'tur:' def test_separate_punctuation(aratea_element): reading = azm.corpus.get_reading_from_line_element(aratea_element) reading.tokens = azm.corpus.separate_punctuation(reading.tokens) verse = 'vertitur: [hanc nemo certo tibi dicere possit,' assert len(reading.tokens) == 10 assert reading.tokens[0].text == 'vertitur' assert reading.tokens[1].text == ':' assert reading.tokens[2].text == '[' assert reading.tokens[3].text == 'hanc' assert reading.tokens[4].text == 'nēmō' assert reading.tokens[5].text == 'certō' assert reading.tokens[6].text == 'tibi' assert reading.tokens[7].text == 'dīcere' assert reading.tokens[8].text == 'possit' assert reading.tokens[9].text == ',' assert reading.tokens[0].span == [0, 8] assert reading.tokens[1].span == [8, 9] assert reading.tokens[2].span == [10, 11] assert reading.tokens[3].span == [11, 15] assert reading.tokens[4].span == [16, 20] assert reading.tokens[5].span == [21, 26] assert reading.tokens[6].span == [27, 31] assert reading.tokens[7].span == [32, 38] assert reading.tokens[8].span == [39, 45] assert reading.tokens[9].span == [45, 46] assert len(reading.tokens[0].syllables) == 3 assert reading.tokens[0].syllables[0].text == 'ver' assert reading.tokens[0].syllables[0].span == [0, 3] assert reading.tokens[0].syllables[1].text == 'ti' assert reading.tokens[0].syllables[1].span == [3, 5] assert reading.tokens[0].syllables[2].text == 'tur' assert reading.tokens[0].syllables[2].span == [5, 8] assert reading.tokens[1].syllables == [] assert reading.tokens[2].syllables == [] assert len(reading.tokens[3].syllables) == 1 assert reading.tokens[3].syllables[0].text == 'hanc' assert reading.tokens[3].syllables[0].span == [11, 15] assert len(reading.tokens[4].syllables) == 2 assert reading.tokens[4].syllables[0].text == 'nē' assert reading.tokens[4].syllables[0].span == [16, 18] assert reading.tokens[4].syllables[1].text == 'mō' assert reading.tokens[4].syllables[1].span == [18, 20] Loading
allzweckmesser/corpus.py +97 −41 Original line number Diff line number Diff line # -*- coding: utf-8 -*- import logging import re import os.path from bs4 import BeautifulSoup Loading @@ -19,10 +21,7 @@ BASE_HTML = """<!DOCTYPE html PUBLIC"-//W3C//DTD XHTML 1.0 Strict//EN" """ class HypotacticLine: def __init__(self, element): self.element = element def get_reading_from_line_element(element): tokens = [] span_begin = 0 idx = 0 Loading @@ -31,10 +30,10 @@ class HypotacticLine: token_text = token_tag.text token = Token( token=token_text, span=(span_begin, span_begin + len(token_text)) span=[span_begin, span_begin + len(token_text)] ) for syllable_tag in token_tag.children: for syllable_tag in token_tag.find_all(name='span', class_='syll'): syllable_text = syllable_tag.text if 'long' in syllable_tag.attrs['class']: syllable_length = 2 Loading @@ -50,18 +49,75 @@ class HypotacticLine: syllable = Syllable( idx=idx, syllable=syllable_text, span=(span_begin, span_begin + len(syllable_text)), span=[span_begin, span_begin + len(syllable_text)], syllable_length=syllable_length, vowel_length=None ) idx += 1 syllables.append(syllable) span_begin += len(syllable_text) # The + 1 is for simulating a space between tokens. span_begin += 1 token.syllables = syllables tokens.append(token) self.reading = Reading(tokens=tokens) return Reading(tokens=tokens) def separate_punctuation(tokens): i = 0 while i < len(tokens): token = tokens[i] m = re.match(r'^(?P<pre_punct>[\W_]*)(?P<non_punct>\w*)' '(?P<post_punct>[\W_]*)$', token.text) if m: pre = m.group('pre_punct') post = m.group('post_punct') # Create tokens for the punctuation before a token. span_begin = token.span[0] for c in pre: tokens.insert(i, Token(c, [span_begin, span_begin + 1])) span_begin += 1 i += 1 # Create tokens for the punctuation after a token. span_begin = token.span[1] - len(post) for c in m.group('post_punct'): tokens.insert(i + 1, Token(c, [span_begin, span_begin + 1])) span_begin += 1 i += 1 # Remove the punctuation from the original token and # from its syllables. token.text = m.group('non_punct') span_begin = token.span[0] + m.start('non_punct') span_end = token.span[1] - len(post) token.span = [span_begin, span_end] if pre: token.syllables[0].text = token.syllables[0].text[len(pre):] token.syllables[0].span[0] = span_begin if post: token.syllables[-1].text = token.syllables[-1].text[:-len(post)] token.syllables[-1].span[1] = span_end else: logging.warn('{!r} does not match the punctuation regex.' .format(token)) i += 1 return tokens class HypotacticLine: def __init__(self, element): self.element = element self.reading = get_reading_from_line_element(element) reading.tokens = separate_punctuation(reading.tokens) class HypotacticDocument: Loading Loading @@ -138,7 +194,7 @@ class HypotacticCorpus: for line in doc.get_lines_with_meter(meters) ) def save_lines(self, file_handle, lines, title='Saved Poems', def save_html_tags(self, file_handle, tags, title='Saved Poems', base_html=BASE_HTML, pretty=False): soup = BeautifulSoup(base_html, self.parser) Loading @@ -148,8 +204,8 @@ class HypotacticCorpus: latin = soup.new_tag('div') latin.attrs['class'] = 'latin' for line in lines: latin.append(line) for tag in tags: latin.append(tag) soup.find(name='body').append(latin) if pretty: Loading
scripts/extract_lines.py +1 −1 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ def main(hypotactic_dir, outfile, limit=0, title='Lines', meters=tuple()): if limit: line_generator = itertools.islice(line_generator, limit) with open(outfile, 'w') as f: corpus.save_lines(f, line_generator, title=title) corpus.save_html_tags(f, line_generator, title=title) def parse_args_and_main(): Loading
tests/test_corpus.py 0 → 100644 +106 −0 Original line number Diff line number Diff line # -*- coding: utf-8 -*- from bs4 import BeautifulSoup import pytest import allzweckmesser as azm @pytest.fixture def aratea_element(): # The line is: # 'vertitur: [hanc nemo certo tibi dicere possit,' return BeautifulSoup( '<div class="line"><span class="word"><span class="syll' ' long">ver</span><span class="syll short">ti</span><span' ' class="syll short">tur:</span></span><span class="word"><span' ' class="syll long">[hanc</span></span><span class="word"><span' ' class="syll long">nē</span><span class="syll' ' long">mō</span></span><span class="word"><span class="syll' ' long">cer</span><span class="syll long">tō</span></span><span' ' class="word"><span class="syll short">ti</span><span class="syll' ' short">bi</span></span><span class="word"><span class="syll' ' long">dī</span><span class="syll short">ce</span><span class="syll' ' short">re</span></span><span class="word"><span class="syll' ' long">pos</span><span class="syll long">sit,</span></span></div>', 'lxml' ) def test_get_reading_from_line_element(aratea_element): reading = azm.corpus.get_reading_from_line_element(aratea_element) verse = 'vertitur: [hanc nemo certo tibi dicere possit,' assert len(reading.tokens) == 7 assert reading.tokens[0].text == 'vertitur:' assert reading.tokens[1].text == '[hanc' assert reading.tokens[2].text == 'nēmō' assert reading.tokens[3].text == 'certō' assert reading.tokens[4].text == 'tibi' assert reading.tokens[5].text == 'dīcere' assert reading.tokens[6].text == 'possit,' assert reading.tokens[0].span == [0, 9] assert reading.tokens[1].span == [10, 15] assert reading.tokens[2].span == [16, 20] assert reading.tokens[3].span == [21, 26] assert reading.tokens[4].span == [27, 31] assert reading.tokens[5].span == [32, 38] assert reading.tokens[6].span == [39, 46] assert len(reading.tokens[0].syllables) == 3 assert reading.tokens[0].syllables[0].text == 'ver' assert reading.tokens[0].syllables[1].text == 'ti' assert reading.tokens[0].syllables[2].text == 'tur:' def test_separate_punctuation(aratea_element): reading = azm.corpus.get_reading_from_line_element(aratea_element) reading.tokens = azm.corpus.separate_punctuation(reading.tokens) verse = 'vertitur: [hanc nemo certo tibi dicere possit,' assert len(reading.tokens) == 10 assert reading.tokens[0].text == 'vertitur' assert reading.tokens[1].text == ':' assert reading.tokens[2].text == '[' assert reading.tokens[3].text == 'hanc' assert reading.tokens[4].text == 'nēmō' assert reading.tokens[5].text == 'certō' assert reading.tokens[6].text == 'tibi' assert reading.tokens[7].text == 'dīcere' assert reading.tokens[8].text == 'possit' assert reading.tokens[9].text == ',' assert reading.tokens[0].span == [0, 8] assert reading.tokens[1].span == [8, 9] assert reading.tokens[2].span == [10, 11] assert reading.tokens[3].span == [11, 15] assert reading.tokens[4].span == [16, 20] assert reading.tokens[5].span == [21, 26] assert reading.tokens[6].span == [27, 31] assert reading.tokens[7].span == [32, 38] assert reading.tokens[8].span == [39, 45] assert reading.tokens[9].span == [45, 46] assert len(reading.tokens[0].syllables) == 3 assert reading.tokens[0].syllables[0].text == 'ver' assert reading.tokens[0].syllables[0].span == [0, 3] assert reading.tokens[0].syllables[1].text == 'ti' assert reading.tokens[0].syllables[1].span == [3, 5] assert reading.tokens[0].syllables[2].text == 'tur' assert reading.tokens[0].syllables[2].span == [5, 8] assert reading.tokens[1].syllables == [] assert reading.tokens[2].syllables == [] assert len(reading.tokens[3].syllables) == 1 assert reading.tokens[3].syllables[0].text == 'hanc' assert reading.tokens[3].syllables[0].span == [11, 15] assert len(reading.tokens[4].syllables) == 2 assert reading.tokens[4].syllables[0].text == 'nē' assert reading.tokens[4].syllables[0].span == [16, 18] assert reading.tokens[4].syllables[1].text == 'mō' assert reading.tokens[4].syllables[1].span == [18, 20]