Commit a2cebbe9 authored by Johan Winge's avatar Johan Winge
Browse files

First public release.

parent b256c4e5
Loading
Loading
Loading
Loading

INSTALL.txt

0 → 100644
+71 −0
Original line number Diff line number Diff line
These instructions are for Linux (specifically of the Debian flavour). Please adapt
to your system, if necessary.

Presumably (since you are reading this) you have already downloaded the source code
for the macronizer and are standing in the directory called "latin-macronizer".

First download the specially adapted version of Morpheus, and compile it:

  git clone https://github.com/Alatius/morpheus.git
  cd morpheus/src
  make
  make install
  cd ..
  ./update.sh
  ./update.sh
  echo "salve" | MORPHLIB=stemlib bin/cruncher -L
  cd ..

(Yes, run the update script twice. And check that it did in fact parse "salve" correctly!)

Now download and compile RFTagger:

  wget -qO- http://www.cis.uni-muenchen.de/~schmid/tools/RFTagger/data/RFTagger.tar.gz | tar xvz
  cd RFTagger/src
  make
  sudo make install
  cd ../..

And the patched version of LDT:

  git clone https://github.com/Alatius/treebank_data.git

Convert the corpus and train RFTagger:

  ./train-rftagger.sh

The macronizer script stores the Morpheus analyses in a PostgreSQL database.
So you will have to install a database server and create a database.
The following works for me; your mileage may vary. Replace the placeholders
theusername and thepassword with credentials of your choice (you will enter the
same values in macronizer.py below).

  sudo apt-get install postgresql
  sudo -u postgres psql
  postgres=# create user theusername password 'thepassword';
  postgres=# create database macronizer encoding 'UTF8' owner theusername;
  postgres=# \q

Install the psycopg2 Python module:

  sudo apt-get install python-psycopg2

Now, edit the main script macronizer.py and set the constants in the beginning to the correct values:

  MACRONIZERLIB = '/path/to/the/latin-macronizer/'
  MORPHEUSDIR = MACRONIZERLIB+'morpheus/'
  RFTAGGERDIR = '/usr/local/bin/'
  DBNAME = 'macronizer'
  DBUSER = 'theusername'
  DBPASSWORD = 'thepassword'
  DBHOST = 'localhost'

Finally, initialize the macronizer:

  python macronizer.py --initialize

You can now test it with the following command:

  python macronizer.py --test

Good luck! In case of problems, contact me at johan.winge@gmail.com and I will do my best to assist you.

corpus2train.py

0 → 100755
+50 −0
Original line number Diff line number Diff line
#!/usr/bin/python
"""Convert Latin Dependency Treebank XML files into RFTagger training data.

Reads the treebank files listed below from corpuspath and writes:
  - ldt-corpus.txt: one "form<TAB>tag<TAB>lemma" line per token, each
    sentence terminated by an artificial PERIOD1 token and a blank line;
  - ldt-vocabulary.txt: every distinct (reassembled) word form, one per line.
"""

import xml.etree.ElementTree as ET
import codecs

corpuspath = "treebank_data/v1.6/latin/data/"
treebankfile = codecs.open("ldt-corpus.txt", "w", "utf8")
vocabularyfile = codecs.open("ldt-vocabulary.txt", "w", "utf8")
vocabulary = set()

# The treebank splits some tokens into separate <word> elements linked by the
# XSEG relation; these buffers hold such a fragment until its host token
# (the directly following token) is seen, so the pieces can be glued together.
xsegment = ""        # fragment to prepend (lemma "other")
xsegmentbehind = ""  # fragment to append (lemmas "que1"/"ne1")
for f in ["1999.02.0010",
          "2008.01.0002",
          "2007.01.0001",
          "1999.02.0060",
          "phi0448.phi001.perseus-lat1",
          "phi0620.phi001.perseus-lat1",
          "phi0959.phi006.perseus-lat1",
          "phi0690.phi003.perseus-lat1"]:
    bank = ET.parse(corpuspath + f + ".tb.xml")
    for sentence in bank.getroot():
        for token in sentence.findall('word'):
            idnum = token.get('id', '_')
            head = token.get('head', '_')
            relation = token.get('relation', '_')
            form = token.get('form', '_')
            lemma = token.get('lemma', form)
            postag = token.get('postag', '_')
            if form != "|" and postag != "" and postag != "_":
                # An XSEG fragment whose head is the very next token: buffer it
                # instead of emitting it as a token of its own.
                if lemma == "other" and relation == "XSEG" and int(head) == int(idnum) + 1:
                    xsegment = form
                    continue
                if (lemma == "que1" or lemma == "ne1") and relation == "XSEG" and int(head) == int(idnum) + 1:
                    xsegmentbehind = form
                    continue
                # RFTagger wants the tag's positions separated by dots,
                # e.g. "n-s---mn-" -> "n.-.s.-.-.-.m.n.-".
                postag = '.'.join(list(postag))
                lemma = lemma.replace("#", "").replace("1", "").replace(" ", "+")
                word = xsegment + form + xsegmentbehind
                treebankfile.write(word + "\t" + postag + "\t" + lemma + "\n")
                vocabulary.add(word)
                xsegment = ""
                xsegmentbehind = ""
        treebankfile.write(".\tu.-.-.-.-.-.-.-.-\tPERIOD1\n")
        treebankfile.write("\n")

for word in vocabulary:
    vocabularyfile.write(word + "\n")

# Close explicitly so output is reliably flushed to disk, rather than relying
# on interpreter shutdown. (The unused "import re" was also removed.)
treebankfile.close()
vocabularyfile.close()

macronizer.py

0 → 100755
+758 −0

File added.

Preview size limit exceeded, changes collapsed.

morpheus2lexicon.py

0 → 100644
+52 −0
Original line number Diff line number Diff line
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Build an RFTagger lexicon from crunched Morpheus analyses.

Reads vocabulary-crunched.txt, where each word-form line is immediately
followed by one line of Morpheus analyses wrapped in <NL>...</NL> markers,
and writes rftagger-lexicon.txt as "form<TAB>tag<TAB>lemma" lines.
The macronized forms are also collected per tag (in tagtoaccents) for the
endings table generated further below in this script.
"""

import postags
import codecs

morpheus = codecs.open("vocabulary-crunched.txt", "r", "utf8")
lexicon = codecs.open("rftagger-lexicon.txt", "w", "utf8")

# tag (LDT-style string) -> list of macronized forms observed with that tag
tagtoaccents = {}

for wordform in morpheus:
    wordform = wordform.strip()
    # The analyses are on the line directly after the form. next(morpheus)
    # works on Python 2.6+ as well as Python 3, unlike the Python-2-only
    # file.next() method used previously.
    NLs = next(morpheus).strip()
    parses = []
    for NL in NLs.split("<NL>"):
        NL = NL.replace("</NL>", "")
        NLparts = NL.split()
        if len(NLparts) > 0:
            parses += postags.Morpheus2Parses(wordform, NL)
    for parse in parses:
        lemma = parse[postags.LEMMA].replace("#", "").replace("1", "").replace(" ", "+")
        accented = parse[postags.ACCENTEDFORM]
        tag = postags.Parse2LDT(parse)
        # setdefault+append mutates the list in place; the old
        # "get(tag,[]) + [x]" rebuilt the whole list on every parse.
        tagtoaccents.setdefault(tag, []).append(postags.unicodeaccents(accented))
        tag = '.'.join(list(tag))
        lexicon.write(wordform + '\t' + tag + '\t' + lemma + '\n')

def escapedaccents(txt):
    """Return txt with every macronized vowel rewritten as the plain vowel plus "_"."""
    mapping = {u"ā": "a_", u"ē": "e_", u"ī": "i_", u"ō": "o_", u"ū": "u_", u"ȳ": "y_",
               u"Ā": "A_", u"Ē": "E_", u"Ī": "I_", u"Ō": "O_", u"Ū": "U_", u"Ȳ": "Y_"}
    # Single pass over the characters instead of twelve chained replace() calls.
    return "".join(mapping.get(ch, ch) for ch in txt)
#enddef
# Derive, per tag, which word-final strings reliably start with a macron:
# one line per tag in macronized-endings.txt, the tag followed by its
# relevant endings (longest first), tab-separated, macrons escaped.
endingsfile = codecs.open("macronized-endings.txt", "w", "utf8")
for tag in tagtoaccents:
    # Count every word-final substring of 1..11 characters, but never longer
    # than the form length minus 4, for all macronized forms with this tag.
    endingfreqs = {}
    for accented in tagtoaccents[tag]:
        for i in range(1, min(len(accented) - 3, 12)):
            ending = accented[-i:]
            endingfreqs[ending] = endingfreqs.get(ending, 0) + 1
    endingsfile.write(tag)
    # Keep an ending only if its first character actually carries a macron
    # and the macronized variant is more frequent than the plain one.
    relevantendings = []
    for ending in endingfreqs:
        endingwithoutmacrons = postags.removemacrons(ending)
        if ending[0] != endingwithoutmacrons[0] and endingfreqs[ending] > endingfreqs.get(endingwithoutmacrons, 1):
            relevantendings.append(ending)
    # Longest endings first. key=len/reverse=True is a stable sort equivalent
    # to the old cmp-based call, but also runs on Python 3 where cmp() and
    # the cmp argument to sort() no longer exist.
    relevantendings.sort(key=len, reverse=True)
    for ending in relevantendings:
        endingsfile.write('\t' + escapedaccents(ending))
    endingsfile.write('\n')

postags.py

0 → 100644
+737 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading