cligs · christofs · Jun 1, 2015 · Jun 1, 2015 · Jun 1, 2015 · Jun 6, 2015
diff --git a/README.md b/README.md
@@ -1,8 +1,4 @@
-toolbox
-=======
+toolbox: Christof's branch
+==========================
 
-Collection of small work-in-progress scripts and code snippets for text processing produced by CLiGS.
-
-Each folder contains one or several Python scripts and some sample texts for testing.
-
-Note that all functions are designed for Python 3 and are experimental in nature and quality.
+This branch is kept mostly up-to-date with the master branch, except that paths may be specific to Christof's system.
diff --git a/__init__.py b/__init__.py
diff --git a/__pycache__/__init__.cpython-34.pyc b/__pycache__/__init__.cpython-34.pyc
diff --git a/__pycache__/extract.cpython-34.pyc b/__pycache__/extract.cpython-34.pyc
diff --git a/activate_toolbox.py b/activate_toolbox.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Filename: activate_toolbox.py
+# Author: Christof Schöch
+
+"""
+# Script to add the "toolbox" module to your syspath. 
+# Run once with appropriate path to use the toolbox with "import".
+"""
+
+import sys
+import os
+
+## Folder that contains your "toolbox" folder; adjust this to your system.
+## Example: "/home/christof/Repos/cligs/"
+toolbox_parent = os.path.abspath("/home/christof/Repos/cligs/")
+sys.path.append(toolbox_parent)
+## Further candidate paths, disabled by default:
+#sys.path.append(os.path.abspath("/usr/local/lib/python3.4/dist-packages/"))
+#sys.path.append(os.path.abspath("/usr/local/lib/python3.4/dist-packages/numpy-1.10.4.egg"))
+
+## Optional: uncomment to remove a (mistaken or redundant) path again.
+#sys.path.remove(os.path.abspath("/usr/local/lib/python3.4/dist-packages/"))
+#sys.path.remove(os.path.abspath("/home/christof/Repos/cligs/"))
+#sys.path.remove(os.path.abspath("/usr/lib/python3.4/"))
+#sys.path.remove(os.path.abspath("/usr/local/lib/python3.4/dist-packages/numpy-1.10.4.egg"))
+
+## Print the resulting search path so the settings can be checked.
+print(sys.path)
diff --git a/analyse/__init__.py b/analyse/__init__.py
diff --git a/annotate/__init__.py b/annotate/__init__.py
diff --git a/annotate/annotate_fw.py b/annotate/annotate_fw.py
diff --git a/annotate/fw2txm.py b/annotate/fw2txm.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Filename: fw2txm.py
+# Authors: #cf
+# 2016-05-20
+
+"""
+Functions to convert Freeling+WordNet format for import into TXM.
+"""
+
+import re
+import os
+import glob
+
+
+# Machine-specific settings: adjust WorkDir to your own system.
+# WorkDir must end with a path separator because the paths below are built
+# by plain string concatenation.
+WorkDir = "/media/christof/data/Dropbox/0-Analysen/2016/wordnet/"
+#WorkDir = "/home/christof/Dropbox/0-Analysen/2016/wordnet/"
+# Glob pattern selecting the input XML files.
+InPath = WorkDir+"wn/*.xml"
+# Folder that receives the converted files.
+TXMFolder = WorkDir+"txm/"
+
+
+
+def fw2txm(InPath, TXMFolder):
+    """
+    Rename Freeling+WordNet <token> elements to <w> for import into TXM.
+
+    Every file matching InPath is read, each "<token" is replaced by "<w"
+    and each "</token>" by "</w>", and the result is written to TXMFolder
+    under the same file name.
+
+    Arguments:
+    InPath (string): glob pattern selecting the input XML files
+    TXMFolder (string): output folder (created if it does not exist)
+
+    Author: #cf.
+    """
+    print("fw2txm...")
+
+    os.makedirs(TXMFolder, exist_ok=True)
+
+    for File in glob.glob(InPath):
+        # Read and write explicitly as UTF-8 so the result does not depend on
+        # the locale's default encoding.
+        # NOTE(review): assumes the annotated XML files are UTF-8 - confirm.
+        with open(File, "r", encoding="utf-8") as InFile:
+            Text = InFile.read()
+
+        # Both search strings are literals, so plain substring replacement
+        # is sufficient (no regular expression needed).
+        TXMText = Text.replace("<token", "<w").replace("</token>", "</w>")
+
+        # os.path.join works whether or not TXMFolder ends with a separator.
+        OutPath = os.path.join(TXMFolder, os.path.basename(File))
+        with open(OutPath, "w", encoding="utf-8") as OutFile:
+            OutFile.write(TXMText)
+
+    print("Done.")
+
+
+# Runs the conversion with the settings defined above. This call is not
+# guarded by __name__, so it is also executed when the module is imported.
+fw2txm(InPath, TXMFolder)
+
+
+
diff --git a/annotate/prepare_tei.py b/annotate/prepare_tei.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+
+Submodule which prepares CLiGS-TEI-files for annotation with e.g. FreeLing and NLTK WordNet.
+After the annotation (external to this module), the annotated files are brought together in new TEI files.
+
+Check out the documentation for the functions prepare_input and prepare_output for more details.
+
+- for chapterwise annotation
+- just the body text is preserved
+- headings, notes and inline markup are discarded
+
+@author: Ulrike Henny
+@filename: prepare_tei.py
+
+"""
+
+import os
+import glob
+import sys
+import re
+import io
+from lxml import etree
+from pathlib import Path
+
+
+class FileResolver(etree.Resolver):
+	"""
+	Resolver which hands every requested URL to resolve_filename().
+	It is registered on the parser used in postpare_anno.
+	"""
+
+	def resolve(self, url, pubid, context):
+		# pubid is not needed; the URL alone identifies the resource
+		resolved = self.resolve_filename(url, context)
+		return resolved
+
+
+# XSLT snippets
+
+# xslt_TEIwrapper builds the TEI template that later receives the annotations:
+# - the root template emits an xml-model processing instruction and a new
+#   TEI/text/body skeleton
+# - the teiHeader is copied, with 'a' appended to the idno of type 'cligs'
+# - each div inside body that has no descendant div and is not inside a
+#   floatingText is copied as an empty element whose xml:id is
+#   [cligs idno]_d[running number of such divs]
+# - all text outside the teiHeader is dropped
+xslt_TEIwrapper = etree.XML('''\
+	<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:tei="http://www.tei-c.org/ns/1.0" version="1.0">
+
+		<xsl:output method="xml" encoding="UTF-8" indent="yes"/>
+
+		<xsl:variable name="cligsID" select="//tei:idno[@type='cligs']"/>
+
+		<xsl:template match="/">
+			<xsl:processing-instruction name="xml-model">href="https://raw.githubusercontent.com/cligs/reference/master/cligs-annotated/cligs_annotated.rnc" type="application/relax-ng-compact-syntax"</xsl:processing-instruction>
+
+            <TEI xmlns="http://www.tei-c.org/ns/1.0">
+                <xsl:apply-templates select="tei:TEI/tei:teiHeader"/>
+                <text>
+                    <body>
+                        <xsl:apply-templates select="tei:TEI/tei:text/tei:body"/>
+                    </body>
+                </text>
+            </TEI>
+		</xsl:template>
+
+		<xsl:template match="tei:div[ancestor::tei:body][not(descendant::tei:div) and not(ancestor::tei:floatingText)]">
+			<xsl:copy>
+				<xsl:attribute name="xml:id"><xsl:value-of select="$cligsID"/>_d<xsl:value-of select="count(preceding::tei:div[ancestor::tei:body][not(descendant::tei:div) and not(ancestor::tei:floatingText)]) + 1"/></xsl:attribute>
+			</xsl:copy>
+		</xsl:template>
+
+		<xsl:template match="tei:idno[@type='cligs']">
+			<xsl:copy>
+				<xsl:copy-of select="@*"/>
+				<xsl:value-of select="concat(.,'a')"/>
+			</xsl:copy>
+		</xsl:template>
+
+		<xsl:template match="tei:teiHeader | tei:teiHeader//node()[not(self::tei:idno[@type='cligs'])] | tei:teiHeader//@* | tei:teiHeader//processing-instruction() | tei:teiHeader//comment()">
+			<xsl:copy>
+				<xsl:apply-templates select="node() | @* | processing-instruction() | comment()"/>
+			</xsl:copy>
+		</xsl:template>
+
+		<xsl:template match="text()[not(ancestor::tei:teiHeader)]"/>
+
+	</xsl:stylesheet>
+	''')
+
+# xslt_extractDIVs turns one div into plain text (output method "text"):
+# - head and note elements are replaced by a single space
+# - every other TEI element is processed recursively, with a space emitted
+#   before and after its content
+# - text nodes are written with normalize-space() applied
+xslt_extractDIVs = etree.XML('''\
+	<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:tei="http://www.tei-c.org/ns/1.0" version="1.0">
+
+		<xsl:output method="text" encoding="UTF-8" indent="yes"/>
+
+		<xsl:template match="tei:head|tei:note">
+			<xsl:text> </xsl:text>
+		</xsl:template>
+
+		<xsl:template match="tei:*[not(name() = 'head') and not(name() = 'note')]">
+			<xsl:text> </xsl:text><xsl:apply-templates /><xsl:text> </xsl:text>
+		</xsl:template>
+
+		<xsl:template match="text()">
+			<xsl:value-of select="normalize-space(.)"/>
+		</xsl:template>
+
+	</xsl:stylesheet>
+	''')
+
+# xslt_joinDIVs is kept as a string; postpare_anno parses it with its own
+# parser before use. It copies the TEI template unchanged, except that each
+# div directly below text/body keeps its attributes and receives the body
+# element(s) (no namespace) of the external document
+# [annofolder][xml:id of the div]_a.xml.
+# The stylesheet parameter "annofolder" is the location prefix of those
+# annotation files.
+xslt_joinDIVs = '''\
+	<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:tei="http://www.tei-c.org/ns/1.0" version="1.0">
+
+		<xsl:param name="annofolder"/>
+
+		<xsl:output method="xml" encoding="UTF-8" indent="yes" />
+
+		<xsl:template match="node() | @* | processing-instruction() | comment()">
+			<xsl:copy>
+				<xsl:apply-templates select="node() | @* | processing-instruction() | comment()"/>
+			</xsl:copy>
+		</xsl:template>
+
+		<xsl:template match="tei:text/tei:body/tei:div">
+			<xsl:copy>
+				<xsl:copy-of select="@*"/>
+				<xsl:copy-of select="document(concat($annofolder, @xml:id,'_a.xml'))//body"/>
+			</xsl:copy>
+		</xsl:template>
+
+	</xsl:stylesheet>
+	'''
+
+
+
+def prepare_anno(infolder, outfolder):
+	"""
+	Takes a collection of TEI files and prepares them for annotation (chapterwise).
+
+	For each input file [name].xml two kinds of output are written:
+	- [outfolder]/temp/[name]_a.xml: the TEI template created with xslt_TEIwrapper
+	- [outfolder]/txt/[name]_d[n].txt: the plain text of the n-th division
+	  (divs inside body without descendant divs and not inside a floatingText),
+	  created with xslt_extractDIVs
+
+	Arguments:
+	infolder (string): path to the input folder (which should contain the input TEI files)
+	outfolder (string): path to the output folder (which is created if it does not exist)
+	"""
+	print("Starting...")
+
+	tei = {'tei':'http://www.tei-c.org/ns/1.0'}
+	chapter_xpath = "//tei:div[ancestor::tei:body][not(descendant::tei:div) and not(ancestor::tei:floatingText)]"
+
+	# check output folders (creating a subfolder also creates outfolder itself)
+	out_tei = os.path.join(outfolder, "temp")
+	out_txt = os.path.join(outfolder, "txt")
+	os.makedirs(out_tei, exist_ok=True)
+	os.makedirs(out_txt, exist_ok=True)
+
+	# compile both stylesheets once instead of once per file / per division
+	make_wrapper = etree.XSLT(xslt_TEIwrapper)
+	extract_text = etree.XSLT(xslt_extractDIVs)
+
+	filecounter = 0
+	for filepath in glob.glob(os.path.join(infolder, "*.xml")):
+		filecounter += 1
+
+		# The file name (without extension) serves as the document id.
+		# NOTE(review): the merge step looks annotations up by the xml:id of
+		# each div, which is built from the idno of type 'cligs'; this assumes
+		# the file name equals that idno - confirm.
+		cligs_id = os.path.splitext(os.path.basename(filepath))[0]
+		print(cligs_id)
+
+		doc = etree.parse(filepath)
+
+		# create TEI wrapper for future annotation results
+		with open(os.path.join(out_tei, cligs_id + "_a.xml"), "w", encoding="utf-8") as output:
+			output.write(str(make_wrapper(doc)))
+
+		# create one full text file per chapter
+		for number, chapter in enumerate(doc.xpath(chapter_xpath, namespaces=tei), start=1):
+			# The document id is part of the file name: without it, the chapter
+			# files of different documents would overwrite each other.
+			outfile = cligs_id + "_d" + str(number) + ".txt"
+
+			with open(os.path.join(out_txt, outfile), "w", encoding="utf-8") as output:
+				output.write(str(extract_text(chapter)))
+
+	print("Done. " + str(filecounter) + " files treated.")
+
+
+
+
+def postpare_anno(infolder, outfolder):
+	"""
+	Creates a TEI file from a collection of annotated full text files (one per chapter).
+	Needs an input folder with two subfolders: 'temp' with the TEI file templates and 'anno' with the annotated text in XML format.
+	Expects the annotated files to be named according to the following example/pattern: nh0006_d1_a.xml / [cligs_id]_d[division_id]_a.xml
+
+	Arguments:
+	infolder (string): path to the input folder (which should contain a folder "temp" with the templates for the new TEI files and a folder "anno" with the annotations in XML format)
+	outfolder (string): path to the output folder (which is created if it does not exist)
+
+	Raises:
+	ValueError: if the input folder or one of its subfolders "temp" and "anno" does not exist
+	"""
+	print("Starting...")
+
+	if not os.path.exists(infolder):
+		raise ValueError("The input folder could not be found.")
+
+	in_temp = os.path.join(infolder, "temp")
+	in_anno = os.path.join(infolder, "anno")
+
+	if not os.path.exists(in_temp):
+		raise ValueError("The folder 'temp' could not be found inside the input folder.")
+	if not os.path.exists(in_anno):
+		raise ValueError("The folder 'anno' could not be found inside the input folder.")
+	os.makedirs(outfolder, exist_ok=True)
+
+	# Path.as_uri() only accepts absolute paths, so make the path absolute first.
+	# The trailing separator of a URI is always "/", independent of the platform.
+	annofolder = Path(os.path.abspath(in_anno)).as_uri() + "/"
+
+	# parser and stylesheet are the same for all files, so set them up once
+	parser = etree.XMLParser(encoding="UTF-8")
+	parser.resolvers.add(FileResolver())
+	xslt_root = etree.parse(io.StringIO(xslt_joinDIVs), parser)
+	transform = etree.XSLT(xslt_root)
+
+	# strparam() passes the value as an XPath string and takes care of quoting,
+	# so paths containing quote characters work as well
+	annofolder_param = etree.XSLT.strparam(annofolder)
+
+	filecounter = 0
+
+	# fetch annotated snippets for each TEI template file
+	for filepath in glob.glob(os.path.join(in_temp, "*.xml")):
+		print("doing file " + filepath)
+		filecounter += 1
+		fn = os.path.basename(filepath)
+
+		doc = etree.parse(filepath, parser)
+		result = str(transform(doc, annofolder=annofolder_param))
+
+		# the inserted annotation bodies (no namespace) become paragraphs
+		result = re.sub("<body xmlns=\"\">","<p>", result)
+		result = re.sub("</s>\n</body>","</s></p>", result)
+
+		# save the results
+		with open(os.path.join(outfolder, fn), "w", encoding="utf-8") as output:
+			output.write(result)
+
+	print("Done. " + str(filecounter) + " files treated.")
+
+
+
+
+def prepare(mode, infolder, outfolder):
+	"""
+	Entry point for creating linguistically annotated versions of a collection of TEI files.
+	The work is done in two phases, selected with the mode argument:
+	- "split": the full text is extracted chapterwise from the TEI files and templates for the new TEI files (which will hold the annotated text) are created
+	- "merge": the annotated full text snippets are brought together in the new TEI files
+
+	Arguments:
+	mode (string): either "split" or "merge"
+	infolder (string): in split-mode: path to the folder with the input TEI files; in merge-mode: path to the annotation working folder (with subfolders "temp" and "anno")
+	outfolder (string): in split-mode: path to the folder for the annotation working files; in merge-mode: path to the folder for the annotated TEI result files. The folders are created if they do not exist.
+
+	Raises:
+	ValueError: if mode is neither "split" nor "merge"
+	"""
+	# reject unknown modes first
+	if mode != "split" and mode != "merge":
+		raise ValueError("Please indicate one of the following as the value for the first argument: 'split', 'merge'")
+
+	worker = prepare_anno if mode == "split" else postpare_anno
+	worker(infolder, outfolder)
+
+
+
+if __name__ == "__main__":
+	# Command line usage: prepare_tei.py MODE INFOLDER OUTFOLDER
+	# prepare() expects the mode as a string ("split" or "merge") followed by
+	# the input and the output folder, so all three values are passed on as given.
+	if len(sys.argv) != 4:
+		sys.exit("Usage: prepare_tei.py {split|merge} INFOLDER OUTFOLDER")
+	prepare(sys.argv[1], sys.argv[2], sys.argv[3])
+
+
diff --git a/annotate/results.csv b/annotate/results.csv