Add function for getting syllables (42d41352) · Commits · Messerschleifer / Allzweckmesser

allzweckmesser/scanner.py

+71 −1

Original line number	Diff line number	Diff line
		@@ -5,7 +5,7 @@ import re
		from typing import Dict, List, Set, Tuple

		from .db import FormAnalysis
		from .model import Reading, Token, Verse
		from .model import Reading, Syllable, Token, Verse
		from .wordlist import WordList

		verses = [
		@@ -172,6 +172,73 @@ def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]:
		return readings


		def get_syllables_for_token(token: Token):
		syllables = []
		if token.accented:
		regex = (
		r'((?:ua\|ue\|ae\|oe\|au\|eu\|yi\|[aeiouy])[_^]?)'
		if token.text[0].isupper()
		else r'((?:ua\|ue\|ae\|oe\|au\|[aeiouy])[_^]?)'
		)
		accented = (token.accented + token.clitic
		if token.clitic
		else token.accented)
		chunks = [
		chunk
		for chunk in re.split(regex, accented, flags=re.IGNORECASE)
		if chunk
		]
		syll_start = token.span[0]
		syll_text = ''
		syll_vowel_length = 1
		syll_has_vowel = False
		for i, c in enumerate(chunks):
		if c[0] in 'aeiouy':
		if syll_has_vowel:
		# Syllable already has a vowel.
		# Add the current syllable and begin a new one.
		syll = Syllable(syllable=syll_text,
		span=[syll_start,
		syll_start + len(syll_text)],
		idx=None,
		vowel_length=syll_vowel_length,
		syllable_length=syll_vowel_length)
		syllables.append(syll)

		# Begin info for new syllable.
		syll_text = c.rstrip('_^')
		syll_start = syll_start + len(syll_text) + 1
		else:
		# Syllable has no vowel yet.
		syll_text += c.rstrip('_^')
		syll_has_vowel = True
		syll_vowel_length = (
		2 if len(c) > 1 and c[1] in 'aeiouy_' else 1
		)
		else:
		syll_text += c.rstrip('_^')

		if syll_text:
		# Add the last syllable.
		syll = Syllable(syllable=syll_text,
		span=[syll_start, syll_start + len(syll_text)],
		idx=None,
		vowel_length=syll_vowel_length,
		syllable_length=syll_vowel_length)
		syllables.append(syll)
		else:
		syllables = None
		return syllables


		def get_syllables(reading):
		for token in reading.tokens:
		token.syllables = get_syllables_for_token(token)
		# TODO: Add positional_lengthening phenomena and adjust syllable
		# lengths accordingly.
		return reading


		class Scanner:

		def __init__(self):
		@@ -183,4 +250,7 @@ class Scanner:
		Verse(verse=v, readings=lemmatize(self.word_list, br))
		for v, br in zip(plain_verses, base_readings)
		]
		for verse in verses:
		for reading in verse.readings:
		get_syllables(reading)
		return verses