Commit 4110eaad authored by Johan Winge
Browse files

Improve u-to-v conversion.

parent 4b2cd440
Loading
Loading
Loading
Loading
+28 −16
Original line number Original line Diff line number Diff line
@@ -144,30 +144,38 @@ class Wordlist():
                NLparts = NL.split()
                NLparts = NL.split()
                if len(NLparts) > 0:
                if len(NLparts) > 0:
                    parses += postags.Morpheus2Parses(wordform,NL)
                    parses += postags.Morpheus2Parses(wordform,NL)
            allaccenteds = set()
            lemmatagtoaccenteds = {}
            filteredparses = []
            for parse in parses:
            for parse in parses:
                lemma = parse[postags.LEMMA].replace("#","").replace("1","").replace(" ","+")
                lemma = parse[postags.LEMMA].replace("#","").replace("1","").replace(" ","+")
                parse[postags.LEMMA] = lemma
                parse[postags.LEMMA] = lemma
                accented = parse[postags.ACCENTEDFORM]
                accented = parse[postags.ACCENTEDFORM]
                if parse[postags.LEMMA].startswith("trans-") and accented[3] != "_": # Work around shortcoming in Morpheus
                if parse[postags.LEMMA].startswith("trans-") and accented[3] != "_": # Work around shortcoming in Morpheus
                    accented = accented[:3] + "_" + accented[3:]
                    accented = accented[:3] + "_" + accented[3:]
                if accented == "male_":
                if accented == "male_" or accented == "cave_":
                    accented = "male"
                    accented = accented[:-1]
                parse[postags.ACCENTEDFORM] = accented
                parse[postags.ACCENTEDFORM] = accented
                # Remove highly unlikely alternatives:
                # Remove highly unlikely alternatives:
                if accented not in ["me_nse_", "fabuli_s", "vi_ri_", "vi_ro_", "vi_rum", "vi_ro_rum", "vi_ri_s", "vi_ro_s"] and not (accented.startswith("vi_ct") and lemma == "vivo") and lemma not in ["pareas","de_-escendo", "de_-eo", "de_-edo", "Nus", "progredio"]:
                if ( accented not in ["me_nse_", "fabuli_s", "vi_ri_", "vi_ro_", "vi_rum", "vi_ro_rum", "vi_ri_s", "vi_ro_s"] and
                    allaccenteds.add(accented.lower())
                     not (accented.startswith("vi_ct") and lemma == "vivo") and
                    filteredparses.append(parse)
                     not (accented.startswith("ori_") and lemma == "orior") and
            if len(allaccenteds) > 1:
                     not (accented.startswith("mori_") and lemma == "morior") and
                knownwords.add(wordform);
                     not (accented.startswith("conci_") and lemma == "concitus") and
                for parse in filteredparses:
                     lemma not in ["pareas","de_-escendo", "de_-eo", "de_-edo", "Nus", "progredio"] ):
                    lemma = parse[postags.LEMMA]
                    accented = parse[postags.ACCENTEDFORM]
                    tag = postags.Parse2LDT(parse)
                    tag = postags.Parse2LDT(parse)
                    lemmatagtoaccenteds[(lemma,tag)] = lemmatagtoaccenteds.get((lemma,tag),[]) + [accented]
            if len(lemmatagtoaccenteds) == 0:
                continue
            knownwords.add(wordform);
            allaccenteds = set()
            for (lemma, tag), accenteds in lemmatagtoaccenteds.items():
                # Sometimes there are several different accented forms; prefer 'volvit' to 'voluit', 'Ju_lius' to 'Iu_lius' etc.
                bestaccented = sorted(accenteds, key = lambda x: x.count('v')+x.count('j')+x.count('J'))[-1]
                lemmatagtoaccenteds[(lemma, tag)] = bestaccented
                allaccenteds.add(bestaccented.lower())
            if len(allaccenteds) > 1:
                for (lemma, tag), accented in lemmatagtoaccenteds.items():
                    self.dbcursor.execute("INSERT INTO morpheus (wordform, morphtag, lemma, accented) VALUES (%s,%s,%s,%s)", (wordform, tag, lemma, accented))
                    self.dbcursor.execute("INSERT INTO morpheus (wordform, morphtag, lemma, accented) VALUES (%s,%s,%s,%s)", (wordform, tag, lemma, accented))
            elif len(allaccenteds) == 1:
            elif len(allaccenteds) == 1:
                knownwords.add(wordform);
                accented = allaccenteds.pop()
                accented = allaccenteds.pop()
                self.dbcursor.execute("INSERT INTO morpheus (wordform, accented) VALUES (%s,%s)", (wordform, accented))
                self.dbcursor.execute("INSERT INTO morpheus (wordform, accented) VALUES (%s,%s)", (wordform, accented))
        ## The remaining were unknown to Morpheus:
        ## The remaining were unknown to Morpheus:
@@ -340,7 +348,7 @@ class Tokenization:
        for oldtoken in self.tokens:
        for oldtoken in self.tokens:
            tobeadded = []
            tobeadded = []
            oldlc = oldtoken.token.lower()
            oldlc = oldtoken.token.lower()
            if oldtoken.isword and (oldlc in wordlist.unknownwords or oldlc in ["nec","neque","necnon","seque","seseque","quique","secumque"]):
            if oldtoken.isword and oldlc != "que" and (oldlc in wordlist.unknownwords or oldlc in ["nec","neque","necnon","seque","seseque","quique","secumque"]):
                if oldlc == "nec":
                if oldlc == "nec":
                    tobeadded = oldtoken.split(1,True)
                    tobeadded = oldtoken.split(1,True)
                elif oldlc == "necnon":
                elif oldlc == "necnon":
@@ -710,6 +718,7 @@ else: # Run as a free-standing Python script
            print "  --test         Mark vowels in a short example text."
            print "  --test         Mark vowels in a short example text."
            print "  --initialize   Reset the database (only necessary once)."
            print "  --initialize   Reset the database (only necessary once)."
            print "  -h  or --help  Show this information."
            print "  -h  or --help  Show this information."
            exit(0)
        elif arg == "--initialize":
        elif arg == "--initialize":
            wordlist = Wordlist()
            wordlist = Wordlist()
            wordlist.reinitializedatabase()
            wordlist.reinitializedatabase()
@@ -728,6 +737,9 @@ else: # Run as a free-standing Python script
            outfilename = iterator.next()
            outfilename = iterator.next()
        elif arg == "--test":
        elif arg == "--test":
            dotest = True
            dotest = True
        else:
            print "Unknown argument:", arg
            exit(1)
    #endfor
    #endfor
    if dotest:
    if dotest:
        texttomacronize = "O orbis terrarum te saluto!\n"
        texttomacronize = "O orbis terrarum te saluto!\n"