Add dev module for testing our tool (3656cdf7) · Commits · Messerschleifer / Allzweckmesser

allzweckmesser/corpus.py

+6 −5

Original line number	Diff line number	Diff line
		@@ -101,7 +101,8 @@ def separate_punctuation(tokens):
		token.syllables[0].text = token.syllables[0].text[len(pre):]
		token.syllables[0].span[0] = span_begin
		if post:
		token.syllables[-1].text = token.syllables[-1].text[:-len(post)]
		token.syllables[-1].text = (token.syllables[-1].
		text[:-len(post)])
		token.syllables[-1].span[1] = span_end

		else:

allzweckmesser/dev.py

0 → 100644

+88 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/python3
		# -*- coding: utf-8 -*-

		import argparse
		import json
		import random
		from typing import List

		from unidecode import unidecode

		from .style import mark_correct
		from .model import Verse
		from .scanner import Scanner


		def dev(reference_verses, number=10, randomize=False) -> List[Verse]:
		    """Scan verses and compare them with their correct reference version.

		    Every sampled verse is scanned from its ``unidecode``-folded text.  A
		    verse counts as correct when at least one reading produced by the
		    scanner has the same schema as the verse's first reference reading.
		    A per-verse report and a final tally are printed to stdout.

		    :param reference_verses: Verses whose first reading is the reference.
		    :param number: Maximum number of verses to analyze.
		    :param randomize: Draw the verses at random instead of taking the
		        first ``number`` ones.
		    :return: The scanner's analysis of every sampled verse, in order.
		    """
		    scanner = Scanner()
		    all_analyses = []
		    correct = 0

		    if randomize:
		        # random.sample raises ValueError when asked for more items than
		        # the population holds; cap the request so that a small corpus
		        # behaves like the slice in the non-random branch.
		        sample = random.sample(reference_verses,
		                               min(number, len(reference_verses)))
		    else:
		        sample = reference_verses[:number]

		    for i, ref in enumerate(sample):
		        if i != 0:
		            # Blank line between the reports of consecutive verses.
		            print()
		        ref_reading = ref.readings[0]

		        analysis = scanner.scan_verses([unidecode(ref.text)])[0]
		        all_analyses.append(analysis)

		        correct_schema = ref_reading.get_schema()
		        this_correct = any(r.get_schema() == correct_schema
		                           for r in analysis.readings)

		        if this_correct:
		            correct += 1
		            print('{ref} ({n} readings)'
		                  .format(ref=mark_correct(ref_reading),
		                          n=len(analysis.readings)))
		        else:
		            print('{ref} ({n} readings)'
		                  .format(ref=ref_reading, n=len(analysis.readings)))
		            # No reading matched: list how each one deviates.
		            for reading in analysis.readings:
		                print(' {}'.format(reading.format_differences(ref_reading)))

		    # An empty sample would otherwise divide by zero.
		    ratio = correct / len(sample) if sample else 0.0
		    print('\nCorrect: {}/{} ({:.2f})'
		          .format(correct, len(sample), ratio))
		    return all_analyses


		def parse_args() -> argparse.Namespace:
		    """Parse arguments from the commandline.

		    ``--infile`` is mandatory so that a missing path is reported by
		    argparse instead of surfacing later as a ``TypeError`` from ``open``.

		    :return: An argparse Namespace holding the arguments.
		    :raises SystemExit: If the command line is invalid (raised by argparse).
		    """
		    d = 'Identify errors in verse parsing.'
		    parser = argparse.ArgumentParser(prog='allzweckmesser', description=d)
		    parser.add_argument('--infile', required=True,
		                        help=('A JSON file containing verses'
		                              ' with one reading each.'))
		    parser.add_argument('--number', '-n', default=10, type=int,
		                        help='Number of verses to analyze')
		    parser.add_argument('--randomize', '-r', default=False,
		                        action='store_true',
		                        help=('Randomize what verses are analyzed. If this is'
		                              ' not set, the first {number} verses are'
		                              ' analyzed.'))
		    return parser.parse_args()


		def main():
		    """Parse CLI arguments then read and scan verses."""
		    args = vars(parse_args())
		    # ``dev`` expects parsed verses rather than a path, so the ``infile``
		    # entry is replaced by ``reference_verses`` before forwarding kwargs.
		    infile = args.pop('infile')
		    # The context manager closes the file once it is parsed; JSON is
		    # read as UTF-8 instead of whatever the locale default happens to be.
		    with open(infile, encoding='utf-8') as verse_file:
		        raw_verses = json.load(verse_file)
		    args['reference_verses'] = [Verse.from_json(verse)
		                                for verse in raw_verses]
		    dev(**args)


		if __name__ == '__main__':
		    main()

allzweckmesser/model.py

+36 −9

Original line number	Diff line number	Diff line
		#!/usr/bin/env python3
		# -*- coding: utf-8 -*-

		import itertools
		import json
		import os
		import re
		from typing import Dict, List, Set

		from .style import mark_long, mark_wrong_length, mark_wrong_syllables


		def check_format(json_file, check_for=dict):

		@@ -80,8 +83,8 @@ class Syllable:
		idx = raw['id']
		span = raw['span']
		text = raw['syllable']
		syllable_length = raw['syllable_length']
		vowel_length = raw['vowel_length']
		syllable_length = raw.get('syllable_length')
		vowel_length = raw.get('vowel_length')
		syllable = cls(text, span, idx, syllable_length, vowel_length)

		if 'phenomena' in raw:
		@@ -117,10 +120,7 @@ class Syllable:
		).format(s=self)

		def __str__(self):
		    """Return the syllable's bare text.

		    The descriptive ``Syllable(text=..., ...)`` rendering that preceded
		    this return was leftover from the previous version and made it
		    unreachable; code that formats a syllable with ``str()`` gets the
		    plain text.
		    """
		    return self.text


		class Phenomenon:
		@@ -423,6 +423,32 @@ class Reading:
		]
		return ' '.join(forms)

		def format_differences(self, reference, mark_long=mark_long,
		                       mark_wrong_length=mark_wrong_length,
		                       mark_wrong_syllables=mark_wrong_syllables,
		                       syllable_joiner='-', token_joiner=' '):
		    """Render this reading with its deviations from ``reference`` marked.

		    Tokens are compared pairwise in order.  When a token has as many
		    syllables as its reference token, the syllables are emitted one by
		    one: a syllable whose ``syllable_length`` is 2 is passed through
		    ``mark_long``, and a syllable whose ``syllable_length`` differs from
		    the reference syllable's is additionally passed through
		    ``mark_wrong_length``.  Otherwise, or when the reference has no token
		    at that position, every syllable of the token is passed through
		    ``mark_wrong_syllables``.

		    :param reference: Reading to compare against.
		    :param mark_long: Callable used to format a syllable of length 2.
		    :param mark_wrong_length: Callable applied to a formatted syllable
		        whose length differs from the reference.
		    :param mark_wrong_syllables: Callable applied to each syllable of a
		        token that cannot be aligned with the reference.
		    :param syllable_joiner: String placed between syllables of a token.
		    :param token_joiner: String placed between formatted tokens.
		    :return: The formatted tokens joined by ``token_joiner``.
		    """
		    formatted_tokens = []
		    for token, ref_token in itertools.zip_longest(self.tokens,
		                                                  reference.tokens):
		        if token is None:
		            # Only the reference has tokens left; there is nothing of
		            # this reading to render for them.
		            break
		        sylls = list(token.syllables)
		        # A token without a reference counterpart cannot be aligned; it
		        # is handled like a syllable-count mismatch instead of being
		        # dropped from the output.
		        ref_sylls = (list(ref_token.syllables)
		                     if ref_token is not None else None)

		        # Compare lengths explicitly rather than relying on the
		        # truthiness of syllable objects to spot missing counterparts.
		        # TODO: also require syll == ref_syll for aligned syllables.
		        if ref_sylls is not None and len(sylls) == len(ref_sylls):
		            formatted_syllables = []
		            for syll, ref_syll in zip(sylls, ref_sylls):
		                fsyll = (mark_long(syll)
		                         if syll.syllable_length == 2
		                         else syll.text)
		                if syll.syllable_length != ref_syll.syllable_length:
		                    fsyll = mark_wrong_length(fsyll)
		                formatted_syllables.append(fsyll)
		        else:
		            formatted_syllables = [mark_wrong_syllables(syll)
		                                   for syll in sylls]
		        formatted_tokens.append(syllable_joiner.join(formatted_syllables))
		    return token_joiner.join(formatted_tokens)


		class Verse:

		@@ -445,6 +471,7 @@ class Verse:

		text = raw['verse']
		source = dict()
		if 'source' in raw:
		source['author'] = raw['source']['author']
		source['work'] = raw['source']['work']
		source['place'] = raw['source']['place']

allzweckmesser/style.py

0 → 100644

+25 −0

Original line number	Diff line number	Diff line
		# -*- coding: utf-8 -*-

		from colorama import init, Back, Fore, Style

		# Set up colorama once at import time, before any helper below is called.
		init()


		def mark_long(text):
		    """Wrap ``text`` between colorama's ``Style.BRIGHT`` and ``Style.NORMAL``."""
		    template = '{Style.BRIGHT}{text}{Style.NORMAL}'
		    return template.format(text=text, Style=Style)


		def mark_wrong_length(text):
		    """Wrap ``text`` between colorama's ``Fore.RED`` and ``Fore.RESET``."""
		    template = '{Fore.RED}{text}{Fore.RESET}'
		    return template.format(text=text, Fore=Fore)


		def mark_wrong_syllables(text):
		    """Wrap ``text`` between colorama's ``Back.RED`` and ``Back.RESET``."""
		    template = '{Back.RED}{text}{Back.RESET}'
		    return template.format(text=text, Back=Back)


		def mark_correct(text):
		    """Wrap ``text`` between colorama's ``Fore.GREEN`` and ``Fore.RESET``."""
		    template = '{Fore.GREEN}{text}{Fore.RESET}'
		    return template.format(text=text, Fore=Fore)

requirements.txt

+2 −0

Original line number	Diff line number	Diff line
		beautifulsoup4==4.6.3
		sqlalchemy==1.2.11
		lxml==4.2.5
		colorama==0.3.9
		unidecode==1.0.22