Commit a2cebbe9 authored by Johan Winge's avatar Johan Winge
Browse files

First public release.

parent b256c4e5
Loading
Loading
Loading
Loading

INSTALL.txt

0 → 100644
+71 −0
Original line number Diff line number Diff line
These instructions are for Linux (specifically of the Debian flavour). Please adapt
to your system, if necessary.

Presumably (since you are reading this) you have already downloaded the source code
for the macronizer and are standing in the directory called "latin-macronizer".

First download the specially adapted version of Morpheus, and compile it:

  git clone https://github.com/Alatius/morpheus.git
  cd morpheus/src
  make
  make install
  cd ..
  ./update.sh
  ./update.sh
  echo "salve" | MORPHLIB=stemlib bin/cruncher -L
  cd ..

(Yes, run the update script twice. And check that it did in fact parse "salve" correctly!)

Now download and compile RFTagger:

  wget -qO- http://www.cis.uni-muenchen.de/~schmid/tools/RFTagger/data/RFTagger.tar.gz | tar xvz
  cd RFTagger/src
  make
  sudo make install
  cd ../..

And the patched version of LDT:

  git clone https://github.com/Alatius/treebank_data.git

Convert the corpus and train RFTagger:

  ./train-rftagger.sh

The macronizer script stores the Morpheus analyses in a PostgreSQL database.
So you will have to install a database server and create a database.
The following works for me; your mileage may vary. Replace the placeholders
theusername and thepassword with credentials of your choice (you will enter the
same values in macronizer.py below).

  sudo apt-get install postgresql
  sudo -u postgres psql
  postgres=# create user theusername password 'thepassword';
  postgres=# create database macronizer encoding 'UTF8' owner theusername;
  postgres=# \q

Install the psycopg2 Python module:

  sudo apt-get install python-psycopg2

Now, edit the main script macronizer.py and set the constants in the beginning to the correct values:

  MACRONIZERLIB = '/path/to/the/latin-macronizer/'
  MORPHEUSDIR = MACRONIZERLIB+'morpheus/'
  RFTAGGERDIR = '/usr/local/bin/'
  DBNAME = 'macronizer'
  DBUSER = 'theusername'
  DBPASSWORD = 'thepassword'
  DBHOST = 'localhost'

Finally, initialize the macronizer:

  python macronizer.py --initialize

You can now test it with the following command:

  python macronizer.py --test

Good luck! In case of problems, contact me at johan.winge@gmail.com and I will do my best to assist you.

corpus2train.py

0 → 100755
+50 −0
Original line number Diff line number Diff line
#!/usr/bin/python
"""Convert Latin Dependency Treebank XML files into RFTagger training data.

Reads the treebank files listed below from corpuspath and writes:
  - ldt-corpus.txt: one "form<TAB>tag<TAB>lemma" line per token, each
    sentence terminated by an artificial PERIOD1 token and a blank line;
  - ldt-vocabulary.txt: every distinct (reassembled) word form, one per line.
"""

import xml.etree.ElementTree as ET
import codecs

corpuspath = "treebank_data/v1.6/latin/data/"
treebankfile = codecs.open("ldt-corpus.txt", "w", "utf8")
vocabularyfile = codecs.open("ldt-vocabulary.txt", "w", "utf8")
vocabulary = set()

# The treebank splits some tokens into separate <word> elements linked by the
# XSEG relation; these buffers hold such a fragment until its host token
# (the directly following token) is seen, so the pieces can be glued together.
xsegment = ""        # fragment to prepend (lemma "other")
xsegmentbehind = ""  # fragment to append (lemmas "que1"/"ne1")
for f in ["1999.02.0010",
          "2008.01.0002",
          "2007.01.0001",
          "1999.02.0060",
          "phi0448.phi001.perseus-lat1",
          "phi0620.phi001.perseus-lat1",
          "phi0959.phi006.perseus-lat1",
          "phi0690.phi003.perseus-lat1"]:
    bank = ET.parse(corpuspath + f + ".tb.xml")
    for sentence in bank.getroot():
        for token in sentence.findall('word'):
            idnum = token.get('id', '_')
            head = token.get('head', '_')
            relation = token.get('relation', '_')
            form = token.get('form', '_')
            lemma = token.get('lemma', form)
            postag = token.get('postag', '_')
            if form != "|" and postag != "" and postag != "_":
                # An XSEG fragment whose head is the very next token: buffer it
                # instead of emitting it as a token of its own.
                if lemma == "other" and relation == "XSEG" and int(head) == int(idnum) + 1:
                    xsegment = form
                    continue
                if (lemma == "que1" or lemma == "ne1") and relation == "XSEG" and int(head) == int(idnum) + 1:
                    xsegmentbehind = form
                    continue
                # RFTagger wants the tag's positions separated by dots,
                # e.g. "n-s---mn-" -> "n.-.s.-.-.-.m.n.-".
                postag = '.'.join(list(postag))
                lemma = lemma.replace("#", "").replace("1", "").replace(" ", "+")
                word = xsegment + form + xsegmentbehind
                treebankfile.write(word + "\t" + postag + "\t" + lemma + "\n")
                vocabulary.add(word)
                xsegment = ""
                xsegmentbehind = ""
        treebankfile.write(".\tu.-.-.-.-.-.-.-.-\tPERIOD1\n")
        treebankfile.write("\n")

for word in vocabulary:
    vocabularyfile.write(word + "\n")

# Close explicitly so output is reliably flushed to disk, rather than relying
# on interpreter shutdown. (The unused "import re" was also removed.)
treebankfile.close()
vocabularyfile.close()

macronizer.py

0 → 100755
+758 −0

File added.

Preview size limit exceeded, changes collapsed.

morpheus2lexicon.py

0 → 100644
+52 −0
Original line number Diff line number Diff line
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Build an RFTagger lexicon from crunched Morpheus analyses.

Reads vocabulary-crunched.txt, where each word-form line is immediately
followed by one line of Morpheus analyses wrapped in <NL>...</NL> markers,
and writes rftagger-lexicon.txt as "form<TAB>tag<TAB>lemma" lines.
The macronized forms are also collected per tag (in tagtoaccents) for the
endings table generated further below in this script.
"""

import postags
import codecs

morpheus = codecs.open("vocabulary-crunched.txt", "r", "utf8")
lexicon = codecs.open("rftagger-lexicon.txt", "w", "utf8")

# tag (LDT-style string) -> list of macronized forms observed with that tag
tagtoaccents = {}

for wordform in morpheus:
    wordform = wordform.strip()
    # The analyses are on the line directly after the form. next(morpheus)
    # works on Python 2.6+ as well as Python 3, unlike the Python-2-only
    # file.next() method used previously.
    NLs = next(morpheus).strip()
    parses = []
    for NL in NLs.split("<NL>"):
        NL = NL.replace("</NL>", "")
        NLparts = NL.split()
        if len(NLparts) > 0:
            parses += postags.Morpheus2Parses(wordform, NL)
    for parse in parses:
        lemma = parse[postags.LEMMA].replace("#", "").replace("1", "").replace(" ", "+")
        accented = parse[postags.ACCENTEDFORM]
        tag = postags.Parse2LDT(parse)
        # setdefault+append mutates the list in place; the old
        # "get(tag,[]) + [x]" rebuilt the whole list on every parse.
        tagtoaccents.setdefault(tag, []).append(postags.unicodeaccents(accented))
        tag = '.'.join(list(tag))
        lexicon.write(wordform + '\t' + tag + '\t' + lemma + '\n')

def escapedaccents(txt):
    """Return txt with every macronized vowel rewritten as the plain vowel plus "_"."""
    mapping = {u"ā": "a_", u"ē": "e_", u"ī": "i_", u"ō": "o_", u"ū": "u_", u"ȳ": "y_",
               u"Ā": "A_", u"Ē": "E_", u"Ī": "I_", u"Ō": "O_", u"Ū": "U_", u"Ȳ": "Y_"}
    # Single pass over the characters instead of twelve chained replace() calls.
    return "".join(mapping.get(ch, ch) for ch in txt)
#enddef
# Derive, per tag, which word-final strings reliably start with a macron:
# one line per tag in macronized-endings.txt, the tag followed by its
# relevant endings (longest first), tab-separated, macrons escaped.
endingsfile = codecs.open("macronized-endings.txt", "w", "utf8")
for tag in tagtoaccents:
    # Count every word-final substring of 1..11 characters, but never longer
    # than the form length minus 4, for all macronized forms with this tag.
    endingfreqs = {}
    for accented in tagtoaccents[tag]:
        for i in range(1, min(len(accented) - 3, 12)):
            ending = accented[-i:]
            endingfreqs[ending] = endingfreqs.get(ending, 0) + 1
    endingsfile.write(tag)
    # Keep an ending only if its first character actually carries a macron
    # and the macronized variant is more frequent than the plain one.
    relevantendings = []
    for ending in endingfreqs:
        endingwithoutmacrons = postags.removemacrons(ending)
        if ending[0] != endingwithoutmacrons[0] and endingfreqs[ending] > endingfreqs.get(endingwithoutmacrons, 1):
            relevantendings.append(ending)
    # Longest endings first. key=len/reverse=True is a stable sort equivalent
    # to the old cmp-based call, but also runs on Python 3 where cmp() and
    # the cmp argument to sort() no longer exist.
    relevantendings.sort(key=len, reverse=True)
    for ending in relevantendings:
        endingsfile.write('\t' + escapedaccents(ending))
    endingsfile.write('\n')

postags.py

0 → 100644
+737 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading