Loading macronizer.py +28 −16 Original line number Original line Diff line number Diff line Loading @@ -144,30 +144,38 @@ class Wordlist(): NLparts = NL.split() NLparts = NL.split() if len(NLparts) > 0: if len(NLparts) > 0: parses += postags.Morpheus2Parses(wordform,NL) parses += postags.Morpheus2Parses(wordform,NL) allaccenteds = set() lemmatagtoaccenteds = {} filteredparses = [] for parse in parses: for parse in parses: lemma = parse[postags.LEMMA].replace("#","").replace("1","").replace(" ","+") lemma = parse[postags.LEMMA].replace("#","").replace("1","").replace(" ","+") parse[postags.LEMMA] = lemma parse[postags.LEMMA] = lemma accented = parse[postags.ACCENTEDFORM] accented = parse[postags.ACCENTEDFORM] if parse[postags.LEMMA].startswith("trans-") and accented[3] != "_": # Work around shortcoming in Morpheus if parse[postags.LEMMA].startswith("trans-") and accented[3] != "_": # Work around shortcoming in Morpheus accented = accented[:3] + "_" + accented[3:] accented = accented[:3] + "_" + accented[3:] if accented == "male_": if accented == "male_" or accented == "cave_": accented = "male" accented = accented[:-1] parse[postags.ACCENTEDFORM] = accented parse[postags.ACCENTEDFORM] = accented # Remove highly unlikely alternatives: # Remove highly unlikely alternatives: if accented not in ["me_nse_", "fabuli_s", "vi_ri_", "vi_ro_", "vi_rum", "vi_ro_rum", "vi_ri_s", "vi_ro_s"] and not (accented.startswith("vi_ct") and lemma == "vivo") and lemma not in ["pareas","de_-escendo", "de_-eo", "de_-edo", "Nus", "progredio"]: if ( accented not in ["me_nse_", "fabuli_s", "vi_ri_", "vi_ro_", "vi_rum", "vi_ro_rum", "vi_ri_s", "vi_ro_s"] and allaccenteds.add(accented.lower()) not (accented.startswith("vi_ct") and lemma == "vivo") and filteredparses.append(parse) not (accented.startswith("ori_") and lemma == "orior") and if len(allaccenteds) > 1: not (accented.startswith("mori_") and lemma == "morior") and knownwords.add(wordform); not (accented.startswith("conci_") and lemma == "concitus") and for parse in filteredparses: lemma not in ["pareas","de_-escendo", "de_-eo", "de_-edo", "Nus", "progredio"] ): lemma = parse[postags.LEMMA] accented = parse[postags.ACCENTEDFORM] tag = postags.Parse2LDT(parse) tag = postags.Parse2LDT(parse) lemmatagtoaccenteds[(lemma,tag)] = lemmatagtoaccenteds.get((lemma,tag),[]) + [accented] if len(lemmatagtoaccenteds) == 0: continue knownwords.add(wordform); allaccenteds = set() for (lemma, tag), accenteds in lemmatagtoaccenteds.items(): # Sometimes there are several different accented forms; prefer 'volvit' to 'voluit', 'Ju_lius' to 'Iu_lius' etc. bestaccented = sorted(accenteds, key = lambda x: x.count('v')+x.count('j')+x.count('J'))[-1] lemmatagtoaccenteds[(lemma, tag)] = bestaccented allaccenteds.add(bestaccented.lower()) if len(allaccenteds) > 1: for (lemma, tag), accented in lemmatagtoaccenteds.items(): self.dbcursor.execute("INSERT INTO morpheus (wordform, morphtag, lemma, accented) VALUES (%s,%s,%s,%s)", (wordform, tag, lemma, accented)) self.dbcursor.execute("INSERT INTO morpheus (wordform, morphtag, lemma, accented) VALUES (%s,%s,%s,%s)", (wordform, tag, lemma, accented)) elif len(allaccenteds) == 1: elif len(allaccenteds) == 1: knownwords.add(wordform); accented = allaccenteds.pop() accented = allaccenteds.pop() self.dbcursor.execute("INSERT INTO morpheus (wordform, accented) VALUES (%s,%s)", (wordform, accented)) self.dbcursor.execute("INSERT INTO morpheus (wordform, accented) VALUES (%s,%s)", (wordform, accented)) ## The remaining were unknown to Morpheus: ## The remaining were unknown to Morpheus: Loading Loading @@ -340,7 +348,7 @@ class Tokenization: for oldtoken in self.tokens: for oldtoken in self.tokens: tobeadded = [] tobeadded = [] oldlc = oldtoken.token.lower() oldlc = oldtoken.token.lower() if oldtoken.isword and (oldlc in wordlist.unknownwords or oldlc in ["nec","neque","necnon","seque","seseque","quique","secumque"]): if oldtoken.isword and oldlc != "que" and (oldlc in wordlist.unknownwords or oldlc in ["nec","neque","necnon","seque","seseque","quique","secumque"]): if oldlc == "nec": if oldlc == "nec": tobeadded = oldtoken.split(1,True) tobeadded = oldtoken.split(1,True) elif oldlc == "necnon": elif oldlc == "necnon": Loading Loading @@ -710,6 +718,7 @@ else: # Run as a free-standing Python script print " --test Mark vowels in a short example text." print " --test Mark vowels in a short example text." print " --initialize Reset the database (only necessary once)." print " --initialize Reset the database (only necessary once)." print " -h or --help Show this information." print " -h or --help Show this information." exit(0) elif arg == "--initialize": elif arg == "--initialize": wordlist = Wordlist() wordlist = Wordlist() wordlist.reinitializedatabase() wordlist.reinitializedatabase() Loading @@ -728,6 +737,9 @@ else: # Run as a free-standing Python script outfilename = iterator.next() outfilename = iterator.next() elif arg == "--test": elif arg == "--test": dotest = True dotest = True else: print "Unknown argument:", arg exit(1) #endfor #endfor if dotest: if dotest: texttomacronize = "O orbis terrarum te saluto!\n" texttomacronize = "O orbis terrarum te saluto!\n" Loading Loading
macronizer.py +28 −16 Original line number Original line Diff line number Diff line Loading @@ -144,30 +144,38 @@ class Wordlist(): NLparts = NL.split() NLparts = NL.split() if len(NLparts) > 0: if len(NLparts) > 0: parses += postags.Morpheus2Parses(wordform,NL) parses += postags.Morpheus2Parses(wordform,NL) allaccenteds = set() lemmatagtoaccenteds = {} filteredparses = [] for parse in parses: for parse in parses: lemma = parse[postags.LEMMA].replace("#","").replace("1","").replace(" ","+") lemma = parse[postags.LEMMA].replace("#","").replace("1","").replace(" ","+") parse[postags.LEMMA] = lemma parse[postags.LEMMA] = lemma accented = parse[postags.ACCENTEDFORM] accented = parse[postags.ACCENTEDFORM] if parse[postags.LEMMA].startswith("trans-") and accented[3] != "_": # Work around shortcoming in Morpheus if parse[postags.LEMMA].startswith("trans-") and accented[3] != "_": # Work around shortcoming in Morpheus accented = accented[:3] + "_" + accented[3:] accented = accented[:3] + "_" + accented[3:] if accented == "male_": if accented == "male_" or accented == "cave_": accented = "male" accented = accented[:-1] parse[postags.ACCENTEDFORM] = accented parse[postags.ACCENTEDFORM] = accented # Remove highly unlikely alternatives: # Remove highly unlikely alternatives: if accented not in ["me_nse_", "fabuli_s", "vi_ri_", "vi_ro_", "vi_rum", "vi_ro_rum", "vi_ri_s", "vi_ro_s"] and not (accented.startswith("vi_ct") and lemma == "vivo") and lemma not in ["pareas","de_-escendo", "de_-eo", "de_-edo", "Nus", "progredio"]: if ( accented not in ["me_nse_", "fabuli_s", "vi_ri_", "vi_ro_", "vi_rum", "vi_ro_rum", "vi_ri_s", "vi_ro_s"] and allaccenteds.add(accented.lower()) not (accented.startswith("vi_ct") and lemma == "vivo") and filteredparses.append(parse) not (accented.startswith("ori_") and lemma == "orior") and if len(allaccenteds) > 1: not (accented.startswith("mori_") and lemma == "morior") and knownwords.add(wordform); not (accented.startswith("conci_") and lemma == "concitus") and for parse in filteredparses: lemma not in ["pareas","de_-escendo", "de_-eo", "de_-edo", "Nus", "progredio"] ): lemma = parse[postags.LEMMA] accented = parse[postags.ACCENTEDFORM] tag = postags.Parse2LDT(parse) tag = postags.Parse2LDT(parse) lemmatagtoaccenteds[(lemma,tag)] = lemmatagtoaccenteds.get((lemma,tag),[]) + [accented] if len(lemmatagtoaccenteds) == 0: continue knownwords.add(wordform); allaccenteds = set() for (lemma, tag), accenteds in lemmatagtoaccenteds.items(): # Sometimes there are several different accented forms; prefer 'volvit' to 'voluit', 'Ju_lius' to 'Iu_lius' etc. bestaccented = sorted(accenteds, key = lambda x: x.count('v')+x.count('j')+x.count('J'))[-1] lemmatagtoaccenteds[(lemma, tag)] = bestaccented allaccenteds.add(bestaccented.lower()) if len(allaccenteds) > 1: for (lemma, tag), accented in lemmatagtoaccenteds.items(): self.dbcursor.execute("INSERT INTO morpheus (wordform, morphtag, lemma, accented) VALUES (%s,%s,%s,%s)", (wordform, tag, lemma, accented)) self.dbcursor.execute("INSERT INTO morpheus (wordform, morphtag, lemma, accented) VALUES (%s,%s,%s,%s)", (wordform, tag, lemma, accented)) elif len(allaccenteds) == 1: elif len(allaccenteds) == 1: knownwords.add(wordform); accented = allaccenteds.pop() accented = allaccenteds.pop() self.dbcursor.execute("INSERT INTO morpheus (wordform, accented) VALUES (%s,%s)", (wordform, accented)) self.dbcursor.execute("INSERT INTO morpheus (wordform, accented) VALUES (%s,%s)", (wordform, accented)) ## The remaining were unknown to Morpheus: ## The remaining were unknown to Morpheus: Loading Loading @@ -340,7 +348,7 @@ class Tokenization: for oldtoken in self.tokens: for oldtoken in self.tokens: tobeadded = [] tobeadded = [] oldlc = oldtoken.token.lower() oldlc = oldtoken.token.lower() if oldtoken.isword and (oldlc in wordlist.unknownwords or oldlc in ["nec","neque","necnon","seque","seseque","quique","secumque"]): if oldtoken.isword and oldlc != "que" and (oldlc in wordlist.unknownwords or oldlc in ["nec","neque","necnon","seque","seseque","quique","secumque"]): if oldlc == "nec": if oldlc == "nec": tobeadded = oldtoken.split(1,True) tobeadded = oldtoken.split(1,True) elif oldlc == "necnon": elif oldlc == "necnon": Loading Loading @@ -710,6 +718,7 @@ else: # Run as a free-standing Python script print " --test Mark vowels in a short example text." print " --test Mark vowels in a short example text." print " --initialize Reset the database (only necessary once)." print " --initialize Reset the database (only necessary once)." print " -h or --help Show this information." print " -h or --help Show this information." exit(0) elif arg == "--initialize": elif arg == "--initialize": wordlist = Wordlist() wordlist = Wordlist() wordlist.reinitializedatabase() wordlist.reinitializedatabase() Loading @@ -728,6 +737,9 @@ else: # Run as a free-standing Python script outfilename = iterator.next() outfilename = iterator.next() elif arg == "--test": elif arg == "--test": dotest = True dotest = True else: print "Unknown argument:", arg exit(1) #endfor #endfor if dotest: if dotest: texttomacronize = "O orbis terrarum te saluto!\n" texttomacronize = "O orbis terrarum te saluto!\n" Loading