Commit 42d41352 authored by Simon Will's avatar Simon Will
Browse files

Add function for getting syllables

parent 9c62bad7
Loading
Loading
Loading
Loading
+71 −1
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@ import re
from typing import Dict, List, Set, Tuple

from .db import FormAnalysis
from .model import Reading, Token, Verse
from .model import Reading, Syllable, Token, Verse
from .wordlist import WordList

verses = [
@@ -172,6 +172,73 @@ def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]:
    return readings


def get_syllables_for_token(token: Token):
    """Split a token's accented form into a list of syllables.

    The accented form (plus the clitic, if the token has one) is split
    around vowel groups, each optionally followed by a ``_`` or ``^``
    marker.  Every vowel group opens a new syllable; consonants are
    appended to the syllable that is currently open, so consonants
    before the first vowel end up in the first syllable and all other
    consonants close the syllable of the preceding vowel.

    The markers are not part of the syllable text.  Spans are counted
    on that marker-free text, starting at ``token.span[0]``.
    NOTE(review): this assumes the accented form matches the token's
    surface text character for character once the markers are removed;
    confirm against the code that fills ``token.accented``.

    :param token: Token providing ``accented``, ``clitic``, ``text``
        and ``span``.
    :return: A list of ``Syllable`` objects, or ``None`` if the token
        has no accented form.  ``vowel_length`` is 2 if the vowel group
        consists of two vowel letters or is marked with ``_``, else 1;
        ``syllable_length`` is initialised to the same value.
    """
    if not token.accented:
        return None

    # Capitalised tokens additionally treat 'eu' and 'yi' as one vowel
    # group.  Slicing instead of indexing keeps an empty text from
    # raising an IndexError.
    vowel_group = (
        r'((?:ua|ue|ae|oe|au|eu|yi|[aeiouy])[_^]?)'
        if token.text[:1].isupper()
        else r'((?:ua|ue|ae|oe|au|[aeiouy])[_^]?)'
    )
    accented = (token.accented + token.clitic
                if token.clitic
                else token.accented)
    chunks = [
        chunk
        for chunk in re.split(vowel_group, accented, flags=re.IGNORECASE)
        if chunk
    ]

    syllables = []
    syll_start = token.span[0]
    syll_text = ''
    syll_vowel_length = 1
    syll_has_vowel = False
    for chunk in chunks:
        plain = chunk.rstrip('_^')

        # The split ignores case, so the vowel test has to as well;
        # otherwise a capitalised initial vowel is taken for a
        # consonant and swallows the following syllable.
        if chunk[0].lower() not in 'aeiouy':
            syll_text += plain
            continue

        if syll_has_vowel:
            # The open syllable already has its vowel: emit it and
            # begin a new one directly after it.  The offset has to be
            # advanced by the length of the *finished* syllable, i.e.
            # before syll_text is replaced.
            syllables.append(
                _build_syllable(syll_text, syll_start, syll_vowel_length)
            )
            syll_start += len(syll_text)
            syll_text = plain
        else:
            # Only leading consonants so far; the vowel joins them.
            syll_text += plain
        syll_has_vowel = True
        syll_vowel_length = (
            2 if len(chunk) > 1 and chunk[1].lower() in 'aeiouy_' else 1
        )

    if syll_text:
        # Emit the syllable that is still open.
        syllables.append(
            _build_syllable(syll_text, syll_start, syll_vowel_length)
        )
    return syllables


def _build_syllable(text, start, vowel_length):
    """Create the Syllable for ``text`` beginning at offset ``start``."""
    return Syllable(syllable=text,
                    span=[start, start + len(text)],
                    idx=None,
                    vowel_length=vowel_length,
                    syllable_length=vowel_length)


def get_syllables(reading):
    """Attach syllables to every token of ``reading``, in place.

    Each token's ``syllables`` attribute is set to the result of
    ``get_syllables_for_token`` for that token.

    :param reading: Object whose ``tokens`` attribute is iterated.
    :return: The same ``reading`` object that was passed in.
    """
    for tok in reading.tokens:
        token_syllables = get_syllables_for_token(tok)
        tok.syllables = token_syllables

    # TODO: Add positional_lengthening phenomena and adjust syllable
    # lengths accordingly.
    return reading


class Scanner:

    def __init__(self):
@@ -183,4 +250,7 @@ class Scanner:
            Verse(verse=v, readings=lemmatize(self.word_list, br))
            for v, br in zip(plain_verses, base_readings)
        ]
        for verse in verses:
            for reading in verse.readings:
                get_syllables(reading)
        return verses