Commit 1be725b2 authored by Simon Will
Browse files

Add module for reading Hypotactic Corpus

parent 5ba4bba0
Loading
Loading
Loading
Loading
+104 −0
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-

import os.path

from bs4 import BeautifulSoup

from .model import Reading, Syllable, Token


class HypotacticLine:
    """A verse line from the Hypotactic corpus, converted into a ``Reading``.

    Every child of the line element is treated as one token and every child
    of a token as one syllable. Spans are character offsets that advance
    only by the length of each syllable's text; nothing is added between
    tokens.

    Attributes set by ``__init__``:
        element: The line element that was passed in.
        reading: ``Reading`` holding the tokens built from ``element``.
    """

    # Marker classes paired with the syllable length they stand for. They
    # are checked in this order and the first one present wins, so a
    # syllable tagged both 'long' and 'elided' counts as long.
    _LENGTH_BY_CLASS = (('long', 2), ('short', 1), ('elided', 0))

    def __init__(self, element):
        """Build the reading for one line element.

        Args:
            element: Parsed markup of one line. NOTE(review): presumably one
                of the ``div.line`` elements yielded by
                ``HypotacticDocument.get_lines`` -- confirm with callers.

        Raises:
            ValueError: If a syllable element carries none of the classes
                'long', 'short' or 'elided' (including when it has no
                ``class`` attribute at all).
        """
        self.element = element
        tokens = []
        span_begin = 0  # running character offset within the line
        idx = 0  # syllable index, counted across the whole line

        # NOTE(review): children are used as tags (``.children``,
        # ``.attrs``); text nodes such as whitespace between tags are not
        # skipped -- confirm the corpus markup contains none.
        for token_tag in element.children:
            token_text = token_tag.text
            token = Token(
                token=token_text,
                span=(span_begin, span_begin + len(token_text))
            )

            syllables = []
            for syllable_tag in token_tag.children:
                syllable_text = syllable_tag.text
                syllable_length = self._syllable_length(syllable_tag)
                syllable_end = span_begin + len(syllable_text)
                syllables.append(Syllable(
                    idx=idx,
                    syllable=syllable_text,
                    span=(span_begin, syllable_end),
                    syllable_length=syllable_length,
                    vowel_length=None
                ))
                idx += 1
                span_begin = syllable_end

            token.syllables = syllables
            tokens.append(token)

        self.reading = Reading(tokens=tokens)

    @classmethod
    def _syllable_length(cls, syllable_tag):
        """Return 2, 1 or 0 for a long, short or elided syllable element.

        Raises:
            ValueError: If none of the marker classes is present.
        """
        # ``.get`` rather than ``['class']``: an element without any class
        # attribute should produce the descriptive ValueError below, not a
        # bare KeyError.
        classes = syllable_tag.attrs.get('class', ())
        for marker, length in cls._LENGTH_BY_CLASS:
            if marker in classes:
                return length
        raise ValueError(
            'Could not determine syllable length of syllable {!r}'
            .format(syllable_tag)
        )


class HypotacticDocument:
    """One HTML file of the Hypotactic corpus, parsed with BeautifulSoup.

    Attributes set by ``__init__``:
        root: The parsed document.
        title: The value of ``root.title``.
    """

    def __init__(self, file_path, parser='lxml', encoding='utf-8'):
        """Read and parse the file at ``file_path``.

        Args:
            file_path: Path of the HTML file to read.
            parser: Parser name handed to ``BeautifulSoup``.
            encoding: Text encoding used to decode the file. Given
                explicitly so the result does not depend on the locale's
                default encoding.
        """
        with open(file_path, encoding=encoding) as f:
            self.root = BeautifulSoup(f, parser)
        self.title = self.root.title

    def get_poems(self, filters=()):
        """Yield each ``div`` of class ``poem`` that passes all filters.

        Args:
            filters: Iterable of callables; each is called with a poem
                element and must return a truthy value for the poem to be
                yielded.
        """
        # Materialize once: a one-shot iterator would otherwise be used up
        # by the first poem, leaving every later poem unfiltered.
        filters = tuple(filters)
        for poem in self.root.find_all(name='div', class_='poem'):
            if all(fil(poem) for fil in filters):
                yield poem

    def get_lines(self, line_filters=(), poem_filters=()):
        """Yield each ``div`` of class ``line`` found inside accepted poems.

        Args:
            line_filters: Iterable of callables applied to each line
                element; all must return a truthy value.
            poem_filters: Iterable of callables passed on to ``get_poems``
                to select the poems that are searched.
        """
        # Same reason as in get_poems: the filters are reused per line.
        line_filters = tuple(line_filters)
        for poem in self.get_poems(poem_filters):
            for line in poem.find_all(name='div', class_='line'):
                if all(fil(line) for fil in line_filters):
                    yield line


class HypotacticCorpus:
    """A collection of ``HypotacticDocument`` objects, one per file path.

    Attributes set by ``__init__``:
        file_paths: The paths that were passed in.
        parser: Parser name used for every document.
        documents: List of parsed documents, in the order of ``file_paths``.
    """

    def __init__(self, file_paths, parser='lxml'):
        """Parse every file in ``file_paths`` right away.

        Args:
            file_paths: Iterable of paths to the corpus files.
            parser: Parser name handed to each ``HypotacticDocument``.
        """
        self.file_paths = file_paths
        self.parser = parser
        self.documents = [HypotacticDocument(p, parser=parser)
                          for p in file_paths]

    @classmethod
    def from_directory(cls, directory, *args, **kwargs):
        """Create a corpus from the regular files directly in ``directory``.

        Entries are taken in sorted name order so that the document order
        is reproducible, and anything that is not a regular file (for
        example a sub-directory) is skipped instead of being opened.

        Args:
            directory: Directory to list; it is not searched recursively.
            *args, **kwargs: Passed through to the constructor after the
                list of file paths.
        """
        file_paths = []
        for basename in sorted(os.listdir(directory)):
            path = os.path.abspath(os.path.join(directory, basename))
            if os.path.isfile(path):
                file_paths.append(path)
        return cls(file_paths, *args, **kwargs)

    def get_poems(self, filters=()):
        """Yield the poems of every document that pass all filters.

        Args:
            filters: Iterable of callables forwarded to each document's
                ``get_poems``.
        """
        # The same filters are handed to every document, so a one-shot
        # iterator has to be materialized before the first document uses it.
        filters = tuple(filters)
        for doc in self.documents:
            for poem in doc.get_poems(filters):
                yield poem

    def get_lines(self, line_filters=(), poem_filters=()):
        """Yield the lines of every document that pass the given filters.

        Args:
            line_filters: Iterable of callables forwarded to each
                document's ``get_lines`` as its line filters.
            poem_filters: Iterable of callables forwarded to each
                document's ``get_lines`` as its poem filters.
        """
        # Materialized for the same reason as in get_poems.
        line_filters = tuple(line_filters)
        poem_filters = tuple(poem_filters)
        for doc in self.documents:
            for line in doc.get_lines(line_filters, poem_filters):
                yield line