# -*- coding: utf-8 -*-
"""Wrappers around BeautifulSoup-parsed files with annotated verse lines.

The documents are searched for ``<div class="poem">`` elements which in
turn contain ``<div class="line">`` elements.  A line element can be turned
into a :class:`Reading` with :class:`HypotacticLine`.
"""

import os.path

from bs4 import BeautifulSoup

from .model import Reading, Syllable, Token

# CSS class -> syllable length.  Checked in this order; the first class
# that is present on the syllable tag wins.
_SYLLABLE_LENGTHS = (('long', 2), ('short', 1), ('elided', 0))


def _syllable_length(syllable_tag):
    """Return the syllable length encoded in the tag's ``class`` attribute.

    :param syllable_tag: parsed tag of a single syllable.
    :returns: 2 for ``long``, 1 for ``short``, 0 for ``elided``.
    :raises ValueError: if none of the known classes is present, including
        the case where the tag has no ``class`` attribute at all.
    """
    # ``.get`` instead of indexing: a tag without a class attribute must
    # end up in the ValueError below, not in a bare KeyError.
    classes = syllable_tag.attrs.get('class', ())
    for css_class, length in _SYLLABLE_LENGTHS:
        if css_class in classes:
            return length
    raise ValueError(
        'Could not determine syllable length of syllable {!r}'
        .format(syllable_tag)
    )


class HypotacticLine:
    """A single verse line converted into a :class:`Reading`.

    Every child of ``element`` becomes a :class:`Token` and every child of
    such a token tag becomes a :class:`Syllable`.  Spans are character
    offsets that are advanced by the length of each syllable's text.

    The original element is kept in ``self.element``, the result in
    ``self.reading``.
    """

    def __init__(self, element):
        """Build the reading for ``element``.

        :param element: parsed tag whose children are token tags.
        :raises ValueError: if a syllable tag carries none of the classes
            ``long``, ``short`` or ``elided``.
        """
        self.element = element
        tokens = []
        span_begin = 0
        idx = 0  # running syllable index across the whole line
        # NOTE(review): all children and grandchildren are treated as tags;
        # bare text nodes (e.g. whitespace between tags) are not handled
        # here -- confirm the input never contains them.
        for token_tag in element.children:
            token_text = token_tag.text
            token = Token(
                token=token_text,
                span=(span_begin, span_begin + len(token_text))
            )
            syllables = []
            for syllable_tag in token_tag.children:
                syllable_text = syllable_tag.text
                syllable_end = span_begin + len(syllable_text)
                syllables.append(Syllable(
                    idx=idx,
                    syllable=syllable_text,
                    span=(span_begin, syllable_end),
                    syllable_length=_syllable_length(syllable_tag),
                    vowel_length=None
                ))
                idx += 1
                span_begin = syllable_end
            token.syllables = syllables
            tokens.append(token)
        self.reading = Reading(tokens=tokens)


class HypotacticDocument:
    """One parsed file; gives access to its poem and line elements."""

    def __init__(self, file_path, parser='lxml'):
        """Parse the file at ``file_path``.

        :param file_path: path of the file to read.
        :param parser: parser name handed to :class:`BeautifulSoup`.
        """
        # Binary mode: BeautifulSoup receives the raw bytes and detects the
        # document encoding itself instead of depending on the locale's
        # default text encoding.
        with open(file_path, 'rb') as f:
            self.root = BeautifulSoup(f, parser)
        self.title = self.root.title

    def get_poems(self, filters=()):
        """Yield every ``div.poem`` element accepted by all ``filters``.

        :param filters: iterable of callables; each is called with a poem
            element and must return a truthy value for it to be kept.
        """
        for poem in self.root.find_all(name='div', class_='poem'):
            if all(fil(poem) for fil in filters):
                yield poem

    def get_lines(self, line_filters=(), poem_filters=()):
        """Yield every ``div.line`` element inside the selected poems.

        :param line_filters: callables a line element has to satisfy.
        :param poem_filters: callables a poem element has to satisfy; see
            :meth:`get_poems`.
        """
        for poem in self.get_poems(poem_filters):
            for line in poem.find_all(name='div', class_='line'):
                if all(fil(line) for fil in line_filters):
                    yield line


class HypotacticCorpus:
    """A collection of :class:`HypotacticDocument` objects."""

    def __init__(self, file_paths, parser='lxml'):
        """Parse all files right away.

        :param file_paths: paths of the files to load.
        :param parser: parser name passed on to every document.
        """
        self.file_paths = file_paths
        self.parser = parser
        self.documents = [HypotacticDocument(p, parser=parser)
                          for p in file_paths]

    @classmethod
    def from_directory(cls, directory, *args, **kwargs):
        """Create a corpus from the regular files directly in ``directory``.

        Extra positional and keyword arguments are forwarded to the
        constructor.
        """
        file_paths = []
        # ``os.listdir`` returns entries in arbitrary order; sorting makes
        # the document order reproducible.
        for basename in sorted(os.listdir(directory)):
            path = os.path.abspath(os.path.join(directory, basename))
            # Subdirectories cannot be opened as documents, skip them.
            if os.path.isfile(path):
                file_paths.append(path)
        return cls(file_paths, *args, **kwargs)

    def get_poems(self, filters=()):
        """Yield the matching poem elements of all documents in order."""
        for doc in self.documents:
            yield from doc.get_poems(filters)

    def get_lines(self, line_filters=(), poem_filters=()):
        """Yield the matching line elements of all documents in order."""
        for doc in self.documents:
            yield from doc.get_lines(line_filters, poem_filters)
# -*- coding: utf-8 -*-
"""Wrappers around BeautifulSoup-parsed files with annotated verse lines.

The documents are searched for ``<div class="poem">`` elements which in
turn contain ``<div class="line">`` elements.  A line element can be turned
into a :class:`Reading` with :class:`HypotacticLine`.
"""

import os.path

from bs4 import BeautifulSoup

from .model import Reading, Syllable, Token

# CSS class -> syllable length.  Checked in this order; the first class
# that is present on the syllable tag wins.
_SYLLABLE_LENGTHS = (('long', 2), ('short', 1), ('elided', 0))


def _syllable_length(syllable_tag):
    """Return the syllable length encoded in the tag's ``class`` attribute.

    :param syllable_tag: parsed tag of a single syllable.
    :returns: 2 for ``long``, 1 for ``short``, 0 for ``elided``.
    :raises ValueError: if none of the known classes is present, including
        the case where the tag has no ``class`` attribute at all.
    """
    # ``.get`` instead of indexing: a tag without a class attribute must
    # end up in the ValueError below, not in a bare KeyError.
    classes = syllable_tag.attrs.get('class', ())
    for css_class, length in _SYLLABLE_LENGTHS:
        if css_class in classes:
            return length
    raise ValueError(
        'Could not determine syllable length of syllable {!r}'
        .format(syllable_tag)
    )


class HypotacticLine:
    """A single verse line converted into a :class:`Reading`.

    Every child of ``element`` becomes a :class:`Token` and every child of
    such a token tag becomes a :class:`Syllable`.  Spans are character
    offsets that are advanced by the length of each syllable's text.

    The original element is kept in ``self.element``, the result in
    ``self.reading``.
    """

    def __init__(self, element):
        """Build the reading for ``element``.

        :param element: parsed tag whose children are token tags.
        :raises ValueError: if a syllable tag carries none of the classes
            ``long``, ``short`` or ``elided``.
        """
        self.element = element
        tokens = []
        span_begin = 0
        idx = 0  # running syllable index across the whole line
        # NOTE(review): all children and grandchildren are treated as tags;
        # bare text nodes (e.g. whitespace between tags) are not handled
        # here -- confirm the input never contains them.
        for token_tag in element.children:
            token_text = token_tag.text
            token = Token(
                token=token_text,
                span=(span_begin, span_begin + len(token_text))
            )
            syllables = []
            for syllable_tag in token_tag.children:
                syllable_text = syllable_tag.text
                syllable_end = span_begin + len(syllable_text)
                syllables.append(Syllable(
                    idx=idx,
                    syllable=syllable_text,
                    span=(span_begin, syllable_end),
                    syllable_length=_syllable_length(syllable_tag),
                    vowel_length=None
                ))
                idx += 1
                span_begin = syllable_end
            token.syllables = syllables
            tokens.append(token)
        self.reading = Reading(tokens=tokens)


class HypotacticDocument:
    """One parsed file; gives access to its poem and line elements."""

    def __init__(self, file_path, parser='lxml'):
        """Parse the file at ``file_path``.

        :param file_path: path of the file to read.
        :param parser: parser name handed to :class:`BeautifulSoup`.
        """
        # Binary mode: BeautifulSoup receives the raw bytes and detects the
        # document encoding itself instead of depending on the locale's
        # default text encoding.
        with open(file_path, 'rb') as f:
            self.root = BeautifulSoup(f, parser)
        self.title = self.root.title

    def get_poems(self, filters=()):
        """Yield every ``div.poem`` element accepted by all ``filters``.

        :param filters: iterable of callables; each is called with a poem
            element and must return a truthy value for it to be kept.
        """
        for poem in self.root.find_all(name='div', class_='poem'):
            if all(fil(poem) for fil in filters):
                yield poem

    def get_lines(self, line_filters=(), poem_filters=()):
        """Yield every ``div.line`` element inside the selected poems.

        :param line_filters: callables a line element has to satisfy.
        :param poem_filters: callables a poem element has to satisfy; see
            :meth:`get_poems`.
        """
        for poem in self.get_poems(poem_filters):
            for line in poem.find_all(name='div', class_='line'):
                if all(fil(line) for fil in line_filters):
                    yield line


class HypotacticCorpus:
    """A collection of :class:`HypotacticDocument` objects."""

    def __init__(self, file_paths, parser='lxml'):
        """Parse all files right away.

        :param file_paths: paths of the files to load.
        :param parser: parser name passed on to every document.
        """
        self.file_paths = file_paths
        self.parser = parser
        self.documents = [HypotacticDocument(p, parser=parser)
                          for p in file_paths]

    @classmethod
    def from_directory(cls, directory, *args, **kwargs):
        """Create a corpus from the regular files directly in ``directory``.

        Extra positional and keyword arguments are forwarded to the
        constructor.
        """
        file_paths = []
        # ``os.listdir`` returns entries in arbitrary order; sorting makes
        # the document order reproducible.
        for basename in sorted(os.listdir(directory)):
            path = os.path.abspath(os.path.join(directory, basename))
            # Subdirectories cannot be opened as documents, skip them.
            if os.path.isfile(path):
                file_paths.append(path)
        return cls(file_paths, *args, **kwargs)

    def get_poems(self, filters=()):
        """Yield the matching poem elements of all documents in order."""
        for doc in self.documents:
            yield from doc.get_poems(filters)

    def get_lines(self, line_filters=(), poem_filters=()):
        """Yield the matching line elements of all documents in order."""
        for doc in self.documents:
            yield from doc.get_lines(line_filters, poem_filters)