Improve u-to-v conversion. (4110eaad) · Commits · Messerschleifer / latin-macronizer

macronizer.py

+28 −16

Original line number	Original line	Diff line number	Diff line
	@@ -144,30 +144,38 @@ class Wordlist():
	NLparts = NL.split()		NLparts = NL.split()
	if len(NLparts) > 0:		if len(NLparts) > 0:
	parses += postags.Morpheus2Parses(wordform,NL)		parses += postags.Morpheus2Parses(wordform,NL)
	allaccenteds = set()		lemmatagtoaccenteds = {}
	filteredparses = []
	for parse in parses:		for parse in parses:
	lemma = parse[postags.LEMMA].replace("#","").replace("1","").replace(" ","+")		lemma = parse[postags.LEMMA].replace("#","").replace("1","").replace(" ","+")
	parse[postags.LEMMA] = lemma		parse[postags.LEMMA] = lemma
	accented = parse[postags.ACCENTEDFORM]		accented = parse[postags.ACCENTEDFORM]
	if parse[postags.LEMMA].startswith("trans-") and accented[3] != "_": # Work around shortcoming in Morpheus		if parse[postags.LEMMA].startswith("trans-") and accented[3] != "_": # Work around shortcoming in Morpheus
	accented = accented[:3] + "_" + accented[3:]		accented = accented[:3] + "_" + accented[3:]
	if accented == "male_":		if accented == "male_" or accented == "cave_":
	accented = "male"		accented = accented[:-1]
	parse[postags.ACCENTEDFORM] = accented		parse[postags.ACCENTEDFORM] = accented
	# Remove highly unlikely alternatives:		# Remove highly unlikely alternatives:
	if accented not in ["me_nse_", "fabuli_s", "vi_ri_", "vi_ro_", "vi_rum", "vi_ro_rum", "vi_ri_s", "vi_ro_s"] and not (accented.startswith("vi_ct") and lemma == "vivo") and lemma not in ["pareas","de_-escendo", "de_-eo", "de_-edo", "Nus", "progredio"]:		if ( accented not in ["me_nse_", "fabuli_s", "vi_ri_", "vi_ro_", "vi_rum", "vi_ro_rum", "vi_ri_s", "vi_ro_s"] and
	allaccenteds.add(accented.lower())		not (accented.startswith("vi_ct") and lemma == "vivo") and
	filteredparses.append(parse)		not (accented.startswith("ori_") and lemma == "orior") and
	if len(allaccenteds) > 1:		not (accented.startswith("mori_") and lemma == "morior") and
	knownwords.add(wordform);		not (accented.startswith("conci_") and lemma == "concitus") and
	for parse in filteredparses:		lemma not in ["pareas","de_-escendo", "de_-eo", "de_-edo", "Nus", "progredio"] ):
	lemma = parse[postags.LEMMA]
	accented = parse[postags.ACCENTEDFORM]
	tag = postags.Parse2LDT(parse)		tag = postags.Parse2LDT(parse)
			lemmatagtoaccenteds[(lemma,tag)] = lemmatagtoaccenteds.get((lemma,tag),[]) + [accented]
			if len(lemmatagtoaccenteds) == 0:
			continue
			knownwords.add(wordform);
			allaccenteds = set()
			for (lemma, tag), accenteds in lemmatagtoaccenteds.items():
			# Sometimes there are several different accented forms; prefer 'volvit' to 'voluit', 'Ju_lius' to 'Iu_lius' etc.
			bestaccented = sorted(accenteds, key = lambda x: x.count('v')+x.count('j')+x.count('J'))[-1]
			lemmatagtoaccenteds[(lemma, tag)] = bestaccented
			allaccenteds.add(bestaccented.lower())
			if len(allaccenteds) > 1:
			for (lemma, tag), accented in lemmatagtoaccenteds.items():
	self.dbcursor.execute("INSERT INTO morpheus (wordform, morphtag, lemma, accented) VALUES (%s,%s,%s,%s)", (wordform, tag, lemma, accented))		self.dbcursor.execute("INSERT INTO morpheus (wordform, morphtag, lemma, accented) VALUES (%s,%s,%s,%s)", (wordform, tag, lemma, accented))
	elif len(allaccenteds) == 1:		elif len(allaccenteds) == 1:
	knownwords.add(wordform);
	accented = allaccenteds.pop()		accented = allaccenteds.pop()
	self.dbcursor.execute("INSERT INTO morpheus (wordform, accented) VALUES (%s,%s)", (wordform, accented))		self.dbcursor.execute("INSERT INTO morpheus (wordform, accented) VALUES (%s,%s)", (wordform, accented))
	## The remaining were unknown to Morpheus:		## The remaining were unknown to Morpheus:
	@@ -340,7 +348,7 @@ class Tokenization:
	for oldtoken in self.tokens:		for oldtoken in self.tokens:
	tobeadded = []		tobeadded = []
	oldlc = oldtoken.token.lower()		oldlc = oldtoken.token.lower()
	if oldtoken.isword and (oldlc in wordlist.unknownwords or oldlc in ["nec","neque","necnon","seque","seseque","quique","secumque"]):		if oldtoken.isword and oldlc != "que" and (oldlc in wordlist.unknownwords or oldlc in ["nec","neque","necnon","seque","seseque","quique","secumque"]):
	if oldlc == "nec":		if oldlc == "nec":
	tobeadded = oldtoken.split(1,True)		tobeadded = oldtoken.split(1,True)
	elif oldlc == "necnon":		elif oldlc == "necnon":
	@@ -710,6 +718,7 @@ else: # Run as a free-standing Python script
	print " --test Mark vowels in a short example text."		print " --test Mark vowels in a short example text."
	print " --initialize Reset the database (only necessary once)."		print " --initialize Reset the database (only necessary once)."
	print " -h or --help Show this information."		print " -h or --help Show this information."
			exit(0)
	elif arg == "--initialize":		elif arg == "--initialize":
	wordlist = Wordlist()		wordlist = Wordlist()
	wordlist.reinitializedatabase()		wordlist.reinitializedatabase()
	@@ -728,6 +737,9 @@ else: # Run as a free-standing Python script
	outfilename = iterator.next()		outfilename = iterator.next()
	elif arg == "--test":		elif arg == "--test":
	dotest = True		dotest = True
			else:
			print "Unknown argument:", arg
			exit(1)
	#endfor		#endfor
	if dotest:		if dotest:
	texttomacronize = "O orbis terrarum te saluto!\n"		texttomacronize = "O orbis terrarum te saluto!\n"