Commit 3656cdf7 authored by Simon Will's avatar Simon Will
Browse files

Add dev module for testing our tool

parent f944c69a
Loading
Loading
Loading
Loading
+6 −5
Original line number Diff line number Diff line
@@ -101,7 +101,8 @@ def separate_punctuation(tokens):
                token.syllables[0].text = token.syllables[0].text[len(pre):]
                token.syllables[0].span[0] = span_begin
            if post:
                token.syllables[-1].text = token.syllables[-1].text[:-len(post)]
                token.syllables[-1].text = (token.syllables[-1].
                                            text[:-len(post)])
                token.syllables[-1].span[1] = span_end

        else:

allzweckmesser/dev.py

0 → 100644
+88 −0
Original line number Diff line number Diff line
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import argparse
import json
import random
from typing import List

from unidecode import unidecode

from .style import mark_correct
from .model import Verse
from .scanner import Scanner


def dev(reference_verses, number=10, randomize=False) -> List[Verse]:
    """Scan verses and compare them with their correct reference version.

    For every selected reference verse the verse text is transliterated
    with ``unidecode`` and scanned. The scan counts as correct when any of
    the readings it produced has the same schema as the first reading of
    the reference verse. A per-verse report and a final summary are printed
    to stdout.

    :param reference_verses: Verses whose first reading is taken as the
        correct one.
    :param number: Maximum number of verses to analyze.
    :param randomize: If true, pick the verses at random instead of taking
        the first ``number`` verses.
    :return: The analyses produced by the scanner, in the order they were
        processed.
    """
    scanner = Scanner()
    all_analyses = []
    correct = 0

    if randomize:
        # random.sample raises ValueError when asked for more items than
        # the population holds, so cap the sample size like slicing does.
        sample_size = min(number, len(reference_verses))
        sample = random.sample(reference_verses, sample_size)
    else:
        sample = reference_verses[:number]

    for i, ref in enumerate(sample):
        if i != 0:
            # Blank line between the reports of two verses.
            print()
        ref_reading = ref.readings[0]

        analysis = scanner.scan_verses([unidecode(ref.text)])[0]
        all_analyses.append(analysis)

        correct_schema = ref_reading.get_schema()
        this_correct = any(r.get_schema() == correct_schema
                           for r in analysis.readings)

        if this_correct:
            correct += 1
            print('{ref} ({n} readings)'
                  .format(ref=mark_correct(ref_reading),
                          n=len(analysis.readings)))
        else:
            print('{ref} ({n} readings)'
                  .format(ref=ref_reading, n=len(analysis.readings)))
            for reading in analysis.readings:
                print('    {}'.format(reading.format_differences(ref_reading)))

    total = len(sample)
    # Avoid a ZeroDivisionError when there was nothing to analyze.
    ratio = correct / total if total else 0.0
    print('\nCorrect: {}/{} ({:.2f})'.format(correct, total, ratio))
    return all_analyses


def parse_args() -> argparse.Namespace:
    """Parse arguments from the commandline.

    ``--infile`` is mandatory: without it there is nothing to read, so
    argparse reports a usage error and exits instead of handing ``None``
    on to the caller.

    :return: An argparse Namespace holding the arguments ``infile``,
        ``number`` and ``randomize``.
    """
    d = 'Identify errors in verse parsing.'
    parser = argparse.ArgumentParser(prog='allzweckmesser', description=d)
    parser.add_argument('--infile', required=True,
                        help=('A JSON file containing verses'
                              ' with one reading each.'))
    parser.add_argument('--number', '-n', default=10, type=int,
                        help='Number of verses to analyze')
    parser.add_argument('--randomize', '-r', default=False,
                        action='store_true',
                        help=('Randomize what verses are analyzed. If this is'
                              ' not set, the first {number} verses are'
                              ' analyzed.'))
    args = parser.parse_args()
    return args


def main():
    """Parse CLI arguments then read and scan verses.

    The JSON file named by ``--infile`` is loaded, every entry is turned
    into a ``Verse`` via ``Verse.from_json`` and the remaining commandline
    options are forwarded to ``dev`` as keyword arguments.
    """
    args = vars(parse_args())
    # 'infile' is not a parameter of dev(), so take it out of the kwargs.
    infile = args.pop('infile')
    # Use a context manager so the file handle is closed deterministically;
    # JSON interchange files are UTF-8 encoded.
    with open(infile, encoding='utf-8') as verse_file:
        raw_verses = json.load(verse_file)
    args['reference_verses'] = [Verse.from_json(verse)
                                for verse in raw_verses]
    dev(**args)


# Run the development scan only when this file is executed as a program,
# not when it is imported. NOTE(review): the module uses package-relative
# imports, so it presumably has to be started as a module of its package
# (``python -m allzweckmesser.dev``) rather than as a plain file.
if __name__ == '__main__':
    main()
+36 −9
Original line number Diff line number Diff line
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import itertools
import json
import os
import re
from typing import Dict, List, Set

from .style import mark_long, mark_wrong_length, mark_wrong_syllables


def check_format(json_file, check_for=dict):

@@ -80,8 +83,8 @@ class Syllable:
        idx = raw['id']
        span = raw['span']
        text = raw['syllable']
        syllable_length = raw['syllable_length']
        vowel_length = raw['vowel_length']
        syllable_length = raw.get('syllable_length')
        vowel_length = raw.get('vowel_length')
        syllable = cls(text, span, idx, syllable_length, vowel_length)

        if 'phenomena' in raw:
@@ -117,10 +120,7 @@ class Syllable:
        ).format(s=self)

    def __str__(self):
        return (
            'Syllable(text={s.text!r}, syllable_length={s.syllable_length},'
            ' vowel_length={s.vowel_length})'
        ).format(s=self)
        return self.text


class Phenomenon:
@@ -423,6 +423,32 @@ class Reading:
        ]
        return ' '.join(forms)

    def format_differences(self, reference, mark_long=mark_long,
                           mark_wrong_length=mark_wrong_length,
                           mark_wrong_syllables=mark_wrong_syllables,
                           syllable_joiner='-', token_joiner=' '):
        """Render this reading with its deviations from a reference marked.

        Tokens are compared pairwise with the tokens of ``reference``. If
        a token has as many syllables as its reference token, each
        syllable with ``syllable_length == 2`` is passed through
        ``mark_long`` and each syllable whose ``syllable_length`` differs
        from the reference syllable is additionally passed through
        ``mark_wrong_length``. Otherwise every syllable of the token is
        passed through ``mark_wrong_syllables``. Tokens of this reading
        that have no counterpart in the reference are treated the same
        way instead of being dropped from the output.

        :param reference: The reading to compare against; it must provide
            ``tokens`` whose items provide ``syllables``.
        :param mark_long: Callable applied to syllables of length 2.
        :param mark_wrong_length: Callable applied to the formatted text of
            syllables whose length differs from the reference.
        :param mark_wrong_syllables: Callable applied to the syllables of
            tokens whose syllabification differs from the reference.
        :param syllable_joiner: String put between syllables of a token.
        :param token_joiner: String put between tokens.
        :return: The formatted reading as a single string.
        """
        formatted_tokens = []
        token_pairs = itertools.zip_longest(self.tokens, reference.tokens)
        for token, ref_token in token_pairs:
            if token is None:
                # The reference has more tokens than this reading; there
                # is nothing of this reading left to display.
                break

            sylls = token.syllables
            # A token without a counterpart can never match the reference.
            same_syllabification = ref_token is not None and all(
                syll and ref_syll  # TODO: and syll == ref_syll
                for syll, ref_syll
                in itertools.zip_longest(sylls, ref_token.syllables)
            )

            if same_syllabification:
                formatted_syllables = []
                # Both lists have the same length here, so zip loses nothing.
                for syll, ref_syll in zip(sylls, ref_token.syllables):
                    fsyll = (mark_long(syll)
                             if syll.syllable_length == 2
                             else syll.text)
                    if syll.syllable_length != ref_syll.syllable_length:
                        fsyll = mark_wrong_length(fsyll)
                    formatted_syllables.append(fsyll)
            else:
                formatted_syllables = [mark_wrong_syllables(syll)
                                       for syll in sylls]
            formatted_tokens.append(syllable_joiner.join(formatted_syllables))
        return token_joiner.join(formatted_tokens)


class Verse:

@@ -445,6 +471,7 @@ class Verse:

        text = raw['verse']
        source = dict()
        if 'source' in raw:
            source['author'] = raw['source']['author']
            source['work'] = raw['source']['work']
            source['place'] = raw['source']['place']
+25 −0
Original line number Diff line number Diff line
# -*- coding: utf-8 -*-

from colorama import init, Back, Fore, Style

init()


def mark_long(text):
    """Return ``text`` wrapped in the bright style, reset to normal after."""
    template = '{Style.BRIGHT}{text}{Style.NORMAL}'
    return template.format(text=text, Style=Style)


def mark_wrong_length(text):
    """Return ``text`` in red foreground, with the foreground reset after."""
    template = '{Fore.RED}{text}{Fore.RESET}'
    return template.format(text=text, Fore=Fore)


def mark_wrong_syllables(text):
    """Return ``text`` on red background, with the background reset after."""
    template = '{Back.RED}{text}{Back.RESET}'
    return template.format(text=text, Back=Back)


def mark_correct(text):
    """Return ``text`` in green foreground, with the foreground reset after."""
    template = '{Fore.GREEN}{text}{Fore.RESET}'
    return template.format(text=text, Fore=Fore)
+2 −0
Original line number Diff line number Diff line
beautifulsoup4==4.6.3
sqlalchemy==1.2.11
lxml==4.2.5
colorama==0.3.9
unidecode==1.0.22