scoppens · mielvds · Dec 7, 2012 · Feb 14, 2013 · Mar 21, 2013 · Mar 21, 2013
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+# Maven build output
+target
+target/*
+# Editor backup files (e.g. pom.xml~)
+*~
diff --git a/lib/org/openrdf/sesame/sesame-model/2.2.4/sesame-model-2.2.4.jar b/lib/org/openrdf/sesame/sesame-model/2.2.4/sesame-model-2.2.4.jar
diff --git a/lib/org/openrdf/sesame/sesame-rio-api/2.2.4/sesame-rio-api-2.2.4.jar b/lib/org/openrdf/sesame/sesame-rio-api/2.2.4/sesame-rio-api-2.2.4.jar
diff --git a/lib/org/openrdf/sesame/sesame-rio-ntriples/2.2.4/sesame-rio-ntriples-2.2.4.jar b/lib/org/openrdf/sesame/sesame-rio-ntriples/2.2.4/sesame-rio-ntriples-2.2.4.jar
diff --git a/pom.xml b/pom.xml
@@ -37,31 +37,40 @@
         <artifactId>maven-compiler-plugin</artifactId>
         <version>2.5.1</version>
         <configuration>
-          <source>1.7</source>
-          <target>1.7</target>
+          <source>1.6</source>
+          <target>1.6</target>
         </configuration>
       </plugin> 
         </plugins>
     </build>
 
+    <repositories>
+        <repository>
+            <id>local</id>
+            <name>Local repository in project tree</name>
+            <url>file:${basedir}/lib</url>
+        </repository>
+    </repositories>
+
+
     <dependencies>
 
         <dependency>
             <groupId>org.openrdf.sesame</groupId>
             <artifactId>sesame-model</artifactId>
-            <version>2.2.4</version>
+            <version>2.7.5</version>
         </dependency>
 
         <dependency>
             <groupId>org.openrdf.sesame</groupId>
             <artifactId>sesame-rio-api</artifactId>
-            <version>2.2.4</version>
+            <version>2.7.5</version>
         </dependency>
 
         <dependency>
             <groupId>org.openrdf.sesame</groupId>
             <artifactId>sesame-rio-ntriples</artifactId>
-            <version>2.2.4</version>
+            <version>2.7.5</version>
         </dependency>
 
         <dependency>
@@ -75,6 +84,13 @@
             <artifactId>commons-compress</artifactId>
             <version>1.0</version>
         </dependency>
+
+        <dependency>
+            <groupId>com.jcraft</groupId>
+            <artifactId>jzlib</artifactId>
+            <version>1.1.2</version>
+        </dependency>
+
 
         <dependency>
             <groupId>net.sf.jopt-simple</groupId>

diff --git a/pom.xml~ b/pom.xml~
@@ -0,0 +1,151 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
+    xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>be.ugent.mmlab</groupId>
+    <artifactId>siren-eostool</artifactId>
+    <version>0.2-SNAPSHOT</version>
+    <name>eic-entity-tool</name>
+
+
+    <developers>
+        <developer>
+        <id>scampi</id>
+        <name>Stephane Campinas</name>
+        <email>stephane.campinas::at::deri.org</email>
+        <organization>DERI Galway</organization>
+        <organizationUrl>http://www.deri.ie/</organizationUrl>
+        <roles>
+            <role>Developer</role>
+        </roles>
+        </developer>
+    </developers>
+
+    <build>
+        <plugins>
+        <plugin>
+            <artifactId>maven-assembly-plugin</artifactId>
+            <configuration>
+            <descriptors>
+                <descriptor>assembly.xml</descriptor>
+            </descriptors>
+            </configuration>
+        </plugin>
+		      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>2.5.1</version>
+        <configuration>
+          <source>1.6</source> <!-- javac refuses -source 1.7 with -target 1.6; keep both levels equal -->
+          <target>1.6</target>
+        </configuration>
+      </plugin> 
+        </plugins>
+    </build>
+
+    <repositories>
+        <repository>
+            <id>local</id>
+            <name>Local repository in project tree</name>
+            <url>file:${basedir}/lib</url>
+        </repository>
+    </repositories>
+
+
+    <dependencies>
+
+        <dependency>
+            <groupId>org.openrdf.sesame</groupId>
+            <artifactId>sesame-model</artifactId>
+            <version>2.2.4</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.openrdf.sesame</groupId>
+            <artifactId>sesame-rio-api</artifactId>
+            <version>2.2.4</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.openrdf.sesame</groupId>
+            <artifactId>sesame-rio-ntriples</artifactId>
+            <version>2.2.4</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.sindice.siren</groupId>
+            <artifactId>siren-core</artifactId>
+            <version>0.2.1-SNAPSHOT</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-compress</artifactId>
+            <version>1.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>net.sf.jopt-simple</groupId>
+            <artifactId>jopt-simple</artifactId>
+            <version>3.2-rc1</version>
+        </dependency>
+
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.8.1</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>0.9.28</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-core</artifactId>
+            <version>3.1.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>commons-lang</groupId>
+            <artifactId>commons-lang</artifactId>
+            <version>2.3</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.solr</groupId>
+            <artifactId>solr-solrj</artifactId>
+            <version>3.5.0</version>
+        </dependency>
+
+        <dependency>
+   			<groupId>commons-logging</groupId>
+   			<artifactId>commons-logging</artifactId>
+   			<version>1.1.1</version>
+		</dependency>
+
+		<dependency>
+   			<groupId>commons-codec</groupId>
+   			<artifactId>commons-codec</artifactId>
+   			<version>1.5</version>
+		</dependency>
+
+		<dependency>
+   			<groupId>org.sindice.siren</groupId>
+   			<artifactId>siren-core</artifactId>
+   			<version>0.2.3-RC2</version>
+		</dependency>
+
+		<dependency>
+   			<groupId>org.sindice.siren</groupId>
+   			<artifactId>siren-solr</artifactId>
+   			<version>0.2.3-RC2</version>
+		</dependency>
+
+    </dependencies>
+
+</project>
diff --git a/required_siren_schema.xml b/required_siren_schema.xml
@@ -0,0 +1,183 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+
+<!--  
+ This is the SIREn/Solr schema file. This file should be named "schema.xml" and
+ should be in the conf directory under the solr home
+ (i.e. ./solr/conf/schema.xml by default) 
+ or located where the classloader for the Solr webapp can find it.
+
+ This example schema is the recommended starting point for users.
+ It should be kept correct and concise, usable out-of-the-box.
+
+ For more information, on how to customize this file, please see
+ http://wiki.apache.org/solr/SchemaXml
+
+ PERFORMANCE NOTE: this schema includes many optional features and should not
+ be used for benchmarking.  To improve performance one could
+  - set stored="false" for all fields possible (esp large fields) when you
+    only need to search on the field but don't need to return the original
+    value.
+  - set indexed="false" if you don't need to search on the field, but only
+    return the field as a result of searching on other indexed fields.
+  - remove all unneeded copyField statements
+  - for best index size and searching performance, set indexed="false"
+    for all general text fields, use copyField to copy them to the
+    catchall "text" field, and use that for searching.
+  - For maximum indexing performance, use the StreamingUpdateSolrServer
+    java client.
+  - Remember to run the JVM in server mode, and use a higher logging level
+    that avoids logging every request
+-->
+
+<schema name="example" version="1.3">
+
+  <types>
+
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+
+    <!-- A Trie based date field for faster date range queries and date faceting. -->
+    <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
+
+    <!-- A uri field that uses WhitespaceTokenizer and WordDelimiterFilter to
+         split URIs into multiple components.  Stopwords are customized by
+         external files.
+         omitNorms is true since it is a short field, and norms do not
+         really make sense for URIs.
+         Does not use the ASCIIFoldingExpansionFilter since URIs should not
+         contain accented characters.
+    -->
+    <fieldType name="uri" class="solr.TextField" omitNorms="true" positionIncrementGap="100">
+      <analyzer type="index">
+
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+
+        <!-- Splits words into subwords based on delimiters
+             - split subwords based on case change
+             - preserveOriginal="1" in order to preserve the original word.
+             Removed split based on numerics to fix SND-355 and SND-1283 
+        -->
+        <filter class="solr.WordDelimiterFilterFactory" 
+                generateWordParts="1" 
+                generateNumberParts="1" 
+                catenateWords="0" 
+                catenateNumbers="0" 
+                catenateAll="0" 
+                splitOnCaseChange="1"
+                splitOnNumerics="0"
+                preserveOriginal="1"/>
+
+        <!-- Filters out those tokens *not* having length min through max 
+             inclusive. -->
+        <filter class="solr.LengthFilterFactory" min="2" max="256"/>
+
+        <!-- Change to lowercase text -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+
+        <!-- Case insensitive stop word removal.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
+        -->
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+
+      </analyzer>
+      <analyzer type="query">
+        <!-- whitespace tokenizer to not tokenize URI -->
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+
+        <!-- Filters out those tokens *not* having length min through max 
+             inclusive. -->
+        <filter class="solr.LengthFilterFactory" min="2" max="256"/>
+
+        <filter class="solr.LowerCaseFilterFactory"/>
+
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords.txt"
+                enablePositionIncrements="true"
+                />
+
+        <!-- Replace Qnames by their name spaces in URIs. -->
+        <filter class="org.sindice.siren.solr.analysis.QNamesFilterFactory" 
+                qnames="qnames.txt"/>
+
+      </analyzer>
+    </fieldType>
+
+    <!--
+      The SIREn field type:
+      The top-level analyzers must be defined in the top-level analyzer
+      configuration file (tuple-analyzers.xml) and the datatype analyzers in
+      the datatype analyzer configuration file (tuple-datatypes.xml).
+
+      Field norms are not useful for SIREn fields. Setting omitNorms to true
+      reduces memory consumption and improves ranking.
+
+      omitTermFreqAndPositions *must* be set to false.
+    -->
+    <fieldType name="ntriple" class="org.sindice.siren.solr.schema.SirenField"
+               omitNorms="true" 
+               omitTermFreqAndPositions="false"
+               analyzerConfig="tuple-analyzers.xml"
+               datatypeConfig="tuple-datatypes.xml"/>
+
+    <fieldType name="tabular" class="org.sindice.siren.solr.schema.SirenField"
+               omitNorms="true" 
+               omitTermFreqAndPositions="false"
+               analyzerConfig="tuple-analyzers.xml"
+               datatypeConfig="tuple-datatypes.xml"/>
+
+ </types>
+
+
+ <fields>
+
+ 	 	<!-- The ID (URL) of the document 
+	     Use the 'string' field type (no tokenisation)
+	-->
+   	<field name="id" type="string" indexed="true" stored="true" required="true"/>
+
+	<field name="label" type="string" indexed="true" stored="true" required="false"/>
+
+	<field name="description" type="string" indexed="true" stored="true" required="false"/>
+
+ 	<!-- The URL of the document
+	     Use the 'uri' field type in order to be tokenised
+	-->
+   	<field name="url" type="uri" indexed="true" stored="true" required="true"/>
+
+	<field name="type" type="ntriple" indexed="true" stored="true" required="false"/>
+
+	<!-- n-triple indexing scheme -->
+   	<field name="ntriple" type="ntriple" indexed="true" stored="true" multiValued="false"/>
+
+        <!-- tabular indexing scheme -->
+        <field name="tabular" type="tabular" indexed="true" stored="false" multiValued="false"/>
+
+ </fields>
+
+ <!-- Field to use to determine and enforce document uniqueness. 
+      Unless this field is marked with required="false", it will be a required field
+   -->
+ <uniqueKey>id</uniqueKey>
+
+ <!-- field for the QueryParser to use when an explicit fieldname is absent -->
+ <defaultSearchField>ntriple</defaultSearchField>
+
+ <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
+ <solrQueryParser defaultOperator="AND"/>
+
+ <!-- copyField commands copy one field to another at the time a document
+      is added to the index.  It's used either to index the same field differently,
+      or to add multiple fields to the same field for easier/faster searching.  -->
+ <copyField source="url" dest="id"/>
+
+<!-- Similarity is the scoring routine for each document vs. a query.
+     A custom similarity for Siren is specified here  -->
+<similarity class="org.sindice.siren.similarity.SirenSimilarity"/>
+
+</schema>