Loading allzweckmesser/db.py +4 −4 Original line number Diff line number Diff line Loading @@ -58,10 +58,10 @@ class FormAnalysis(BASE): return repr(self) def __eq__(self, other): return (self.form == other.form and self.morphtag == other.morphtag and self.lemma == other.lemma and self.accented == other.accented) return (isinstance(other, FormAnalysis) and ((self.form, self.morphtag, self.lemma, self.accented) == (other.form, other.morphtag, other.lemma, other.accented))) def __hash__(self): return hash((self.form, self.morphtag, self.lemma, self.accented)) Loading allzweckmesser/model.py +40 −17 Original line number Diff line number Diff line Loading @@ -3,8 +3,8 @@ import json import os import sys from typing import List import re from typing import Dict, List, Set def check_format(json_file, check_for=dict): Loading Loading @@ -255,18 +255,21 @@ class MultisyllablePhenomenon(Phenomenon): class Token: def __init__(self, token: str, span: List[int], syllables: List[Syllable] = list(), analysis: str = None, clitic: str = None): syllables: List[Syllable] = None, clitic: str = None, accented: str = None, lemma_to_morphtags: Dict[str, Set[str]] = None): if len(token) != span[1]-span[0]: raise ValueError('Length of token {} does not match span {}.' .format(token, span)) else: self.text = token self.span = span self.syllables = syllables self.analysis = analysis self.syllables = syllables or list() self.clitic = clitic self.accented = accented self.lemma_to_morphtags = lemma_to_morphtags @classmethod def from_json(cls, json_file): raw = check_format(json_file) Loading @@ -281,7 +284,6 @@ class Token: token.clitic = raw['clitic'] if 'syllables' in raw: token.syllables = list() for syllable in raw['syllables']: token.syllables.append(Syllable.from_json(syllable)) Loading Loading @@ -316,9 +318,9 @@ class Token: class Reading: def __init__(self, tokens: List[Token], phenomena: dict = dict()): self.tokens = tokens self.phenomena = phenomena def __init__(self, tokens: List[Token] = None, phenomena: dict = None): self.tokens = tokens or list() self.phenomena = phenomena or dict() @classmethod def from_json(cls, json_file): Loading @@ -333,7 +335,6 @@ class Reading: reading = cls(tokens) if 'phenomena' in raw: reading.phenomena = dict() for phenomenon in raw['phenomena'].items(): key, value = phenomenon for v in value: Loading @@ -360,14 +361,31 @@ class Reading: return json.dumps(self.to_dict()) def __len__(self): return len(self.tokens) def append_token(self, token: Token): self.tokens.append(token) def __str__(self): forms = [ t.accented if t.accented is not None else t.text for t in self.tokens ] return ' '.join(forms) def __repr__(self): # TODO: Implement this properly. return str(self) class Verse: def __init__(self, verse: str, readings: List[Reading] = list(), source: dict = None): def __init__(self, verse: str, source: dict = None, readings: List[Reading] = None): self.text = verse self.source = source self.readings = readings self.readings = readings or list() @classmethod def from_plain_verse(cls, plain_verse): Loading @@ -387,7 +405,6 @@ class Verse: source['place'] = raw['source']['place'] verse = cls(text, source=source) verse.readings = list() for reading in raw['readings']: verse.readings.append(Reading.from_json(reading)) Loading @@ -408,3 +425,9 @@ class Verse: def to_json(self): return json.dumps(self.to_dict()) def __str__(self): s = 'Verse: {verse}\n{reading_num} Readings:\n{readings}' readings_str = '\n'.join(str(r) for r in self.readings) return s.format(verse=self.text, reading_num=len(self.readings), readings=readings_str) allzweckmesser/scan.py +12 −8 Original line number Diff line number Diff line Loading @@ -6,11 +6,15 @@ import sys from typing import List from .meters import ALL_METERS from .scanner import Scanner def scan(verses: List[str], meters=ALL_METERS, **options): def scan(plain_verses: List[str], meters=ALL_METERS, **options): """Scan Latin verses.""" pass scanner = Scanner() scanned_verses = scanner.scan_verses(plain_verses) for v in scanned_verses: print(v) def parse_args() -> argparse.Namespace: Loading @@ -28,7 +32,7 @@ def parse_args() -> argparse.Namespace: return args def get_verses(infile: str = None) -> List[str]: def get_plain_verses(infile: str = None) -> List[str]: """Read verses that are to be scanned. If infile is None the verses are read from the standard input. Loading @@ -38,18 +42,18 @@ def get_verses(infile: str = None) -> List[str]: """ if infile: with open(infile) as f: verses = [line.strip() for line in f.readlines()] plain_verses = [line.strip() for line in f.readlines()] else: verses = [line.strip() for line in sys.stdin.readlines()] return verses plain_verses = [line.strip() for line in sys.stdin.readlines()] return plain_verses def main(): """Parse CLI arguments then read and scan verses.""" args = vars(parse_args()) args['verses'] = get_verses(args['infile']) args['plain_verses'] = get_plain_verses(args['infile']) del args['infile'] scan(args) scan(**args) if __name__ == '__main__': Loading allzweckmesser/scanner.py +81 −18 Original line number Diff line number Diff line Loading @@ -2,9 +2,10 @@ import copy import re from typing import List from typing import Dict, List, Set, Tuple from .model import Token from .db import FormAnalysis from .model import Reading, Token, Verse from .wordlist import WordList verses = [ Loading @@ -17,7 +18,14 @@ verses = [ CLITICS = ['que', 'qve', 'ue', 've', 'ne'] def get_clitic(token): def get_clitic(token: str) -> Tuple[str, str]: """Split a clitic from the token if possible. :param token: A token that may contain a clitic. :return: A tuple of token without clitic and clitic, if a clitic was found. Or a tuple of the original token and None if no clitic was found. """ for clitic in CLITICS: if token.endswith(clitic): return token[:-len(clitic)], clitic Loading @@ -25,8 +33,8 @@ def get_clitic(token): return token, None def multiply_readings(readings: List[List[Token]], n: int) -> List[List[Token]]: def multiply_readings(readings: List[Reading], n: int) -> List[Reading]: """Copy the readings n - 1 times. :param readings: The readings that are to be multiplied. Loading @@ -36,13 +44,24 @@ def multiply_readings(readings: List[List[Token]], orig_readings_len = len(readings) for _ in range(n - 1): for i in range(orig_readings_len): new_reading = [copy.copy(token) for token in readings[i]] # TODO: Think about moving this to Reading in model.py new_reading = Reading( [copy.copy(token) for token in readings[i].tokens] ) readings.append(new_reading) return readings def tokenize(plain_verse): def tokenize(plain_verse: str) -> List[Token]: """Tokenize a verse. This function first splits on whitespace and then further on punctuation. Punctuation marks are regarded as tokens and are therefore included in the list of returned tokens. :param plain_verse: The verse that is to be tokenized. :return: A list of the found tokens. """ tokens = [] i = 0 # Index into the whole verse. for token in re.split(r'\s', plain_verse): Loading Loading @@ -77,9 +96,45 @@ def tokenize(plain_verse): return tokens def lemmatize_verses(word_list, tokens): def condense_analyses( analyses: Set[FormAnalysis]) -> Dict[str, Dict[str, Set[str]]]: """Condense analyses objects into a nested dict representation. :param analyses: The analyses that are to be condensed. :return: A condensed version of the analyses. The keys in the outer dict are the accented forms, the keys in the inner dict are lemmas and the strings in the set are the morphtags. """ condensed = {} for a in analyses: if a.accented in condensed: if a.lemma in condensed[a.accented]: condensed[a.accented][a.lemma].add(a.morphtag) else: condensed[a.accented][a.lemma] = {a.morphtag} else: condensed[a.accented] = {a.lemma: {a.morphtag}} return condensed def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]: """Find different possible readings by analyzing the word forms. This function analyzes the word forms in the verse and creates readings for all possible combinations of accented versions of the words. E.g. if two words occur with more than one accented version, say one with two accented versions and the other with three accented versions, a total of six readings will be generated. :param word_list: The word list to look up the word forms. :param reading: A basic reading of a verse that is to be analyzed. :return: A list of readings of the verse that differ with respect to the accented versions for the forms. """ token_alternatives = [] for token in tokens: for token in reading.tokens: if token.is_punct(): analyses = None else: Loading @@ -90,34 +145,42 @@ def lemmatize_verses(word_list, tokens): token.clitic = clitic analyses = word_list.analyze(bare) if analyses: alternatives = [] for a in analyses: if analyses: condensed_analyses = condense_analyses(analyses) for accented, lemma_to_morphtags in condensed_analyses.items(): # The token should not have any syllables at this # point so that the question of copy vs deepcopy # does not even arise. t = copy.copy(token) t.analysis = a t.accented = accented t.lemma_to_morphtags = lemma_to_morphtags alternatives.append(t) else: alternatives.append(token) token_alternatives.append(alternatives) readings = [[]] readings = [Reading()] for alternatives in token_alternatives: orig_readings_len = len(readings) readings = multiply_readings(readings, len(alternatives)) for i, token in enumerate(alternatives): start = i * orig_readings_len for reading in readings[start:start+orig_readings_len]: reading.append(token) reading.append_token(token) return readings class Scanner: def __init__(self, plain_verses): def __init__(self): self.word_list = WordList() self.plain_verses = plain_verses self.tokenized_verses = [tokenize(v) for v in self.plain_verses] def scan_verses(self, plain_verses: List[str]): base_readings = [Reading(tokens=tokenize(v)) for v in plain_verses] verses = [ Verse(verse=v, readings=lemmatize(self.word_list, br)) for v, br in zip(plain_verses, base_readings) ] return verses tests/test_scan.py +3 −3 Original line number Diff line number Diff line Loading @@ -7,7 +7,7 @@ import allzweckmesser TEST_DIR = os.path.dirname(__file__) def test_get_verses(): def test_get_plain_verses(): correct = [ 'nunc dum tibi lubet licetque pota perde rem', 'antehac est habitus parcus nec magis continens', Loading @@ -15,5 +15,5 @@ def test_get_verses(): 'an, quod ubique, tuum est? tua sunt Heliconia Tempe?', ] test_infile = os.path.join(TEST_DIR, 'verses.txt') verses = allzweckmesser.scan.get_verses(test_infile) assert verses == correct plain_verses = allzweckmesser.scan.get_plain_verses(test_infile) assert plain_verses == correct Loading
allzweckmesser/db.py +4 −4 Original line number Diff line number Diff line Loading @@ -58,10 +58,10 @@ class FormAnalysis(BASE): return repr(self) def __eq__(self, other): return (self.form == other.form and self.morphtag == other.morphtag and self.lemma == other.lemma and self.accented == other.accented) return (isinstance(other, FormAnalysis) and ((self.form, self.morphtag, self.lemma, self.accented) == (other.form, other.morphtag, other.lemma, other.accented))) def __hash__(self): return hash((self.form, self.morphtag, self.lemma, self.accented)) Loading
allzweckmesser/model.py +40 −17 Original line number Diff line number Diff line Loading @@ -3,8 +3,8 @@ import json import os import sys from typing import List import re from typing import Dict, List, Set def check_format(json_file, check_for=dict): Loading Loading @@ -255,18 +255,21 @@ class MultisyllablePhenomenon(Phenomenon): class Token: def __init__(self, token: str, span: List[int], syllables: List[Syllable] = list(), analysis: str = None, clitic: str = None): syllables: List[Syllable] = None, clitic: str = None, accented: str = None, lemma_to_morphtags: Dict[str, Set[str]] = None): if len(token) != span[1]-span[0]: raise ValueError('Length of token {} does not match span {}.' .format(token, span)) else: self.text = token self.span = span self.syllables = syllables self.analysis = analysis self.syllables = syllables or list() self.clitic = clitic self.accented = accented self.lemma_to_morphtags = lemma_to_morphtags @classmethod def from_json(cls, json_file): raw = check_format(json_file) Loading @@ -281,7 +284,6 @@ class Token: token.clitic = raw['clitic'] if 'syllables' in raw: token.syllables = list() for syllable in raw['syllables']: token.syllables.append(Syllable.from_json(syllable)) Loading Loading @@ -316,9 +318,9 @@ class Token: class Reading: def __init__(self, tokens: List[Token], phenomena: dict = dict()): self.tokens = tokens self.phenomena = phenomena def __init__(self, tokens: List[Token] = None, phenomena: dict = None): self.tokens = tokens or list() self.phenomena = phenomena or dict() @classmethod def from_json(cls, json_file): Loading @@ -333,7 +335,6 @@ class Reading: reading = cls(tokens) if 'phenomena' in raw: reading.phenomena = dict() for phenomenon in raw['phenomena'].items(): key, value = phenomenon for v in value: Loading @@ -360,14 +361,31 @@ class Reading: return json.dumps(self.to_dict()) def __len__(self): return len(self.tokens) def append_token(self, token: Token): self.tokens.append(token) def __str__(self): forms = [ t.accented if t.accented is not None else t.text for t in self.tokens ] return ' '.join(forms) def __repr__(self): # TODO: Implement this properly. return str(self) class Verse: def __init__(self, verse: str, readings: List[Reading] = list(), source: dict = None): def __init__(self, verse: str, source: dict = None, readings: List[Reading] = None): self.text = verse self.source = source self.readings = readings self.readings = readings or list() @classmethod def from_plain_verse(cls, plain_verse): Loading @@ -387,7 +405,6 @@ class Verse: source['place'] = raw['source']['place'] verse = cls(text, source=source) verse.readings = list() for reading in raw['readings']: verse.readings.append(Reading.from_json(reading)) Loading @@ -408,3 +425,9 @@ class Verse: def to_json(self): return json.dumps(self.to_dict()) def __str__(self): s = 'Verse: {verse}\n{reading_num} Readings:\n{readings}' readings_str = '\n'.join(str(r) for r in self.readings) return s.format(verse=self.text, reading_num=len(self.readings), readings=readings_str)
allzweckmesser/scan.py +12 −8 Original line number Diff line number Diff line Loading @@ -6,11 +6,15 @@ import sys from typing import List from .meters import ALL_METERS from .scanner import Scanner def scan(verses: List[str], meters=ALL_METERS, **options): def scan(plain_verses: List[str], meters=ALL_METERS, **options): """Scan Latin verses.""" pass scanner = Scanner() scanned_verses = scanner.scan_verses(plain_verses) for v in scanned_verses: print(v) def parse_args() -> argparse.Namespace: Loading @@ -28,7 +32,7 @@ def parse_args() -> argparse.Namespace: return args def get_verses(infile: str = None) -> List[str]: def get_plain_verses(infile: str = None) -> List[str]: """Read verses that are to be scanned. If infile is None the verses are read from the standard input. Loading @@ -38,18 +42,18 @@ def get_verses(infile: str = None) -> List[str]: """ if infile: with open(infile) as f: verses = [line.strip() for line in f.readlines()] plain_verses = [line.strip() for line in f.readlines()] else: verses = [line.strip() for line in sys.stdin.readlines()] return verses plain_verses = [line.strip() for line in sys.stdin.readlines()] return plain_verses def main(): """Parse CLI arguments then read and scan verses.""" args = vars(parse_args()) args['verses'] = get_verses(args['infile']) args['plain_verses'] = get_plain_verses(args['infile']) del args['infile'] scan(args) scan(**args) if __name__ == '__main__': Loading
allzweckmesser/scanner.py +81 −18 Original line number Diff line number Diff line Loading @@ -2,9 +2,10 @@ import copy import re from typing import List from typing import Dict, List, Set, Tuple from .model import Token from .db import FormAnalysis from .model import Reading, Token, Verse from .wordlist import WordList verses = [ Loading @@ -17,7 +18,14 @@ verses = [ CLITICS = ['que', 'qve', 'ue', 've', 'ne'] def get_clitic(token): def get_clitic(token: str) -> Tuple[str, str]: """Split a clitic from the token if possible. :param token: A token that may contain a clitic. :return: A tuple of token without clitic and clitic, if a clitic was found. Or a tuple of the original token and None if no clitic was found. """ for clitic in CLITICS: if token.endswith(clitic): return token[:-len(clitic)], clitic Loading @@ -25,8 +33,8 @@ def get_clitic(token): return token, None def multiply_readings(readings: List[List[Token]], n: int) -> List[List[Token]]: def multiply_readings(readings: List[Reading], n: int) -> List[Reading]: """Copy the readings n - 1 times. :param readings: The readings that are to be multiplied. Loading @@ -36,13 +44,24 @@ def multiply_readings(readings: List[List[Token]], orig_readings_len = len(readings) for _ in range(n - 1): for i in range(orig_readings_len): new_reading = [copy.copy(token) for token in readings[i]] # TODO: Think about moving this to Reading in model.py new_reading = Reading( [copy.copy(token) for token in readings[i].tokens] ) readings.append(new_reading) return readings def tokenize(plain_verse): def tokenize(plain_verse: str) -> List[Token]: """Tokenize a verse. This function first splits on whitespace and then further on punctuation. Punctuation marks are regarded as tokens and are therefore included in the list of returned tokens. :param plain_verse: The verse that is to be tokenized. :return: A list of the found tokens. """ tokens = [] i = 0 # Index into the whole verse. for token in re.split(r'\s', plain_verse): Loading Loading @@ -77,9 +96,45 @@ def tokenize(plain_verse): return tokens def lemmatize_verses(word_list, tokens): def condense_analyses( analyses: Set[FormAnalysis]) -> Dict[str, Dict[str, Set[str]]]: """Condense analyses objects into a nested dict representation. :param analyses: The analyses that are to be condensed. :return: A condensed version of the analyses. The keys in the outer dict are the accented forms, the keys in the inner dict are lemmas and the strings in the set are the morphtags. """ condensed = {} for a in analyses: if a.accented in condensed: if a.lemma in condensed[a.accented]: condensed[a.accented][a.lemma].add(a.morphtag) else: condensed[a.accented][a.lemma] = {a.morphtag} else: condensed[a.accented] = {a.lemma: {a.morphtag}} return condensed def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]: """Find different possible readings by analyzing the word forms. This function analyzes the word forms in the verse and creates readings for all possible combinations of accented versions of the words. E.g. if two words occur with more than one accented version, say one with two accented versions and the other with three accented versions, a total of six readings will be generated. :param word_list: The word list to look up the word forms. :param reading: A basic reading of a verse that is to be analyzed. :return: A list of readings of the verse that differ with respect to the accented versions for the forms. """ token_alternatives = [] for token in tokens: for token in reading.tokens: if token.is_punct(): analyses = None else: Loading @@ -90,34 +145,42 @@ def lemmatize_verses(word_list, tokens): token.clitic = clitic analyses = word_list.analyze(bare) if analyses: alternatives = [] for a in analyses: if analyses: condensed_analyses = condense_analyses(analyses) for accented, lemma_to_morphtags in condensed_analyses.items(): # The token should not have any syllables at this # point so that the question of copy vs deepcopy # does not even arise. t = copy.copy(token) t.analysis = a t.accented = accented t.lemma_to_morphtags = lemma_to_morphtags alternatives.append(t) else: alternatives.append(token) token_alternatives.append(alternatives) readings = [[]] readings = [Reading()] for alternatives in token_alternatives: orig_readings_len = len(readings) readings = multiply_readings(readings, len(alternatives)) for i, token in enumerate(alternatives): start = i * orig_readings_len for reading in readings[start:start+orig_readings_len]: reading.append(token) reading.append_token(token) return readings class Scanner: def __init__(self, plain_verses): def __init__(self): self.word_list = WordList() self.plain_verses = plain_verses self.tokenized_verses = [tokenize(v) for v in self.plain_verses] def scan_verses(self, plain_verses: List[str]): base_readings = [Reading(tokens=tokenize(v)) for v in plain_verses] verses = [ Verse(verse=v, readings=lemmatize(self.word_list, br)) for v, br in zip(plain_verses, base_readings) ] return verses
tests/test_scan.py +3 −3 Original line number Diff line number Diff line Loading @@ -7,7 +7,7 @@ import allzweckmesser TEST_DIR = os.path.dirname(__file__) def test_get_verses(): def test_get_plain_verses(): correct = [ 'nunc dum tibi lubet licetque pota perde rem', 'antehac est habitus parcus nec magis continens', Loading @@ -15,5 +15,5 @@ def test_get_verses(): 'an, quod ubique, tuum est? tua sunt Heliconia Tempe?', ] test_infile = os.path.join(TEST_DIR, 'verses.txt') verses = allzweckmesser.scan.get_verses(test_infile) assert verses == correct plain_verses = allzweckmesser.scan.get_plain_verses(test_infile) assert plain_verses == correct