# allzweckmesser/scanner.py -- syllabification step of the scanner.
#
# Reconstructed from a collapsed diff view. Regions the diff elided (the
# module-level ``verses`` list, the body of ``lemmatize`` and most of the
# ``Scanner`` class) are not visible and are therefore not reproduced here.
import re
from typing import Dict, List, Set, Tuple

from .db import FormAnalysis
from .model import Reading, Syllable, Token, Verse
from .wordlist import WordList

# Vowel chunk patterns used to split an accented form. The capturing group
# makes ``re.split`` keep the vowel chunks in its result. Tokens whose text
# starts with an uppercase letter additionally treat "eu" and "yi" as a
# single vowel chunk.
_VOWEL_CHUNK_UPPER = r'((?:ua|ue|ae|oe|au|eu|yi|[aeiouy])[_^]?)'
_VOWEL_CHUNK_LOWER = r'((?:ua|ue|ae|oe|au|[aeiouy])[_^]?)'

_VOWELS = 'aeiouy'
_LENGTH_MARKERS = '_^'


def _make_syllable(text, start, vowel_length):
    """Build a ``Syllable`` for ``text`` beginning at offset ``start``."""
    return Syllable(syllable=text,
                    span=[start, start + len(text)],
                    idx=None,
                    vowel_length=vowel_length,
                    # Positional lengthening is not handled yet, so the
                    # syllable length simply mirrors the vowel length.
                    syllable_length=vowel_length)


def get_syllables_for_token(token: Token):
    """Split a token's accented form into syllables.

    The accented form (with the clitic appended, if the token has one) is
    split into vowel chunks and the text between them. A syllable consists
    of everything up to and including one vowel chunk plus the non-vowel
    text that follows it, up to the next vowel chunk. The ``_`` and ``^``
    markers are stripped from the syllable text.

    A vowel chunk whose second character is another vowel letter or ``_``
    gets vowel length 2, any other vowel chunk gets 1 (presumably ``_``
    marks a long and ``^`` a short vowel -- confirm against the word list
    format).

    :param token: token providing ``text``, ``accented``, ``clitic`` and
        ``span``; ``span[0]`` is used as the start offset of the first
        syllable.
    :return: list of ``Syllable`` objects, or ``None`` if the token has no
        accented form.
    """
    if not token.accented:
        return None

    # Slicing instead of indexing keeps an empty ``text`` from raising.
    pattern = (_VOWEL_CHUNK_UPPER
               if token.text[:1].isupper()
               else _VOWEL_CHUNK_LOWER)
    accented = (token.accented + token.clitic
                if token.clitic
                else token.accented)
    chunks = [
        chunk
        for chunk in re.split(pattern, accented, flags=re.IGNORECASE)
        if chunk
    ]

    syllables = []
    start = token.span[0]
    text = ''
    vowel_length = 1
    has_vowel = False
    for chunk in chunks:
        bare = chunk.rstrip(_LENGTH_MARKERS)

        # The split is case-insensitive, so the vowel test has to be, too;
        # otherwise a capitalised initial vowel is taken for a consonant.
        if chunk[0].lower() not in _VOWELS:
            text += bare
            continue

        if has_vowel:
            # The pending syllable already has a vowel: emit it and begin
            # a new one directly after it. The offset must be advanced by
            # the length of the *finished* syllable, before ``text`` is
            # replaced by the new chunk.
            syllables.append(_make_syllable(text, start, vowel_length))
            start += len(text)
            text = bare
        else:
            # First vowel of the pending syllable.
            text += bare
            has_vowel = True

        # The length always describes the vowel of the pending syllable.
        is_long = len(chunk) > 1 and chunk[1].lower() in _VOWELS + '_'
        vowel_length = 2 if is_long else 1

    if text:
        # Emit the last (or only) syllable.
        syllables.append(_make_syllable(text, start, vowel_length))
    return syllables


def get_syllables(reading):
    """Attach syllables to every token of ``reading``.

    Each token's ``syllables`` attribute is set to the result of
    ``get_syllables_for_token``.

    :param reading: object whose ``tokens`` are processed in place.
    :return: the same ``reading`` object.
    """
    for token in reading.tokens:
        token.syllables = get_syllables_for_token(token)
    # TODO: Add positional_lengthening phenomena and adjust syllable
    # lengths accordingly.
    return reading


# NOTE(review): the diff also adds, at the end of a ``Scanner`` method, a
# loop that calls ``get_syllables(reading)`` for every reading of every
# verse before ``return verses``. The enclosing method is only partially
# visible in the diff, so it is left out here rather than guessed at.
# allzweckmesser/scanner.py -- syllabification step of the scanner.
#
# Reconstructed from a collapsed diff view. Regions the diff elided (the
# module-level ``verses`` list, the body of ``lemmatize`` and most of the
# ``Scanner`` class) are not visible and are therefore not reproduced here.
import re
from typing import Dict, List, Set, Tuple

from .db import FormAnalysis
from .model import Reading, Syllable, Token, Verse
from .wordlist import WordList

# Vowel chunk patterns used to split an accented form. The capturing group
# makes ``re.split`` keep the vowel chunks in its result. Tokens whose text
# starts with an uppercase letter additionally treat "eu" and "yi" as a
# single vowel chunk.
_VOWEL_CHUNK_UPPER = r'((?:ua|ue|ae|oe|au|eu|yi|[aeiouy])[_^]?)'
_VOWEL_CHUNK_LOWER = r'((?:ua|ue|ae|oe|au|[aeiouy])[_^]?)'

_VOWELS = 'aeiouy'
_LENGTH_MARKERS = '_^'


def _make_syllable(text, start, vowel_length):
    """Build a ``Syllable`` for ``text`` beginning at offset ``start``."""
    return Syllable(syllable=text,
                    span=[start, start + len(text)],
                    idx=None,
                    vowel_length=vowel_length,
                    # Positional lengthening is not handled yet, so the
                    # syllable length simply mirrors the vowel length.
                    syllable_length=vowel_length)


def get_syllables_for_token(token: Token):
    """Split a token's accented form into syllables.

    The accented form (with the clitic appended, if the token has one) is
    split into vowel chunks and the text between them. A syllable consists
    of everything up to and including one vowel chunk plus the non-vowel
    text that follows it, up to the next vowel chunk. The ``_`` and ``^``
    markers are stripped from the syllable text.

    A vowel chunk whose second character is another vowel letter or ``_``
    gets vowel length 2, any other vowel chunk gets 1 (presumably ``_``
    marks a long and ``^`` a short vowel -- confirm against the word list
    format).

    :param token: token providing ``text``, ``accented``, ``clitic`` and
        ``span``; ``span[0]`` is used as the start offset of the first
        syllable.
    :return: list of ``Syllable`` objects, or ``None`` if the token has no
        accented form.
    """
    if not token.accented:
        return None

    # Slicing instead of indexing keeps an empty ``text`` from raising.
    pattern = (_VOWEL_CHUNK_UPPER
               if token.text[:1].isupper()
               else _VOWEL_CHUNK_LOWER)
    accented = (token.accented + token.clitic
                if token.clitic
                else token.accented)
    chunks = [
        chunk
        for chunk in re.split(pattern, accented, flags=re.IGNORECASE)
        if chunk
    ]

    syllables = []
    start = token.span[0]
    text = ''
    vowel_length = 1
    has_vowel = False
    for chunk in chunks:
        bare = chunk.rstrip(_LENGTH_MARKERS)

        # The split is case-insensitive, so the vowel test has to be, too;
        # otherwise a capitalised initial vowel is taken for a consonant.
        if chunk[0].lower() not in _VOWELS:
            text += bare
            continue

        if has_vowel:
            # The pending syllable already has a vowel: emit it and begin
            # a new one directly after it. The offset must be advanced by
            # the length of the *finished* syllable, before ``text`` is
            # replaced by the new chunk.
            syllables.append(_make_syllable(text, start, vowel_length))
            start += len(text)
            text = bare
        else:
            # First vowel of the pending syllable.
            text += bare
            has_vowel = True

        # The length always describes the vowel of the pending syllable.
        is_long = len(chunk) > 1 and chunk[1].lower() in _VOWELS + '_'
        vowel_length = 2 if is_long else 1

    if text:
        # Emit the last (or only) syllable.
        syllables.append(_make_syllable(text, start, vowel_length))
    return syllables


def get_syllables(reading):
    """Attach syllables to every token of ``reading``.

    Each token's ``syllables`` attribute is set to the result of
    ``get_syllables_for_token``.

    :param reading: object whose ``tokens`` are processed in place.
    :return: the same ``reading`` object.
    """
    for token in reading.tokens:
        token.syllables = get_syllables_for_token(token)
    # TODO: Add positional_lengthening phenomena and adjust syllable
    # lengths accordingly.
    return reading


# NOTE(review): the diff also adds, at the end of a ``Scanner`` method, a
# loop that calls ``get_syllables(reading)`` for every reading of every
# verse before ``return verses``. The enclosing method is only partially
# visible in the diff, so it is left out here rather than guessed at.