Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
target
target/*
Binary file not shown.
Binary file not shown.
Binary file not shown.
26 changes: 21 additions & 5 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -37,31 +37,40 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>2.5.1</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>
</plugins>
</build>

<repositories>
<repository>
<id>local</id>
<name>Local repository in project tree</name>
<url>file:${basedir}/lib</url>
</repository>
</repositories>


<dependencies>

<dependency>
<groupId>org.openrdf.sesame</groupId>
<artifactId>sesame-model</artifactId>
<version>2.2.4</version>
<version>2.7.5</version>
</dependency>

<dependency>
<groupId>org.openrdf.sesame</groupId>
<artifactId>sesame-rio-api</artifactId>
<version>2.2.4</version>
<version>2.7.5</version>
</dependency>

<dependency>
<groupId>org.openrdf.sesame</groupId>
<artifactId>sesame-rio-ntriples</artifactId>
<version>2.2.4</version>
<version>2.7.5</version>
</dependency>

<dependency>
Expand All @@ -75,6 +84,13 @@
<artifactId>commons-compress</artifactId>
<version>1.0</version>
</dependency>

<dependency>
<groupId>com.jcraft</groupId>
<artifactId>jzlib</artifactId>
<version>1.1.2</version>
</dependency>


<dependency>
<groupId>net.sf.jopt-simple</groupId>
Expand Down
151 changes: 151 additions & 0 deletions pom.xml~
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<modelVersion>4.0.0</modelVersion>

<groupId>be.ugent.mmlab</groupId>
<artifactId>siren-eostool</artifactId>
<version>0.2-SNAPSHOT</version>
<name>eic-entity-tool</name>


<developers>
<developer>
<id>scampi</id>
<name>Stephane Campinas</name>
<email>stephane.campinas::at::deri.org</email>
<organization>DERI Galway</organization>
<organizationUrl>http://www.deri.ie/</organizationUrl>
<roles>
<role>Developer</role>
</roles>
</developer>
</developers>

<build>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>assembly.xml</descriptor>
</descriptors>
</configuration>
</plugin>
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-compiler-plugin</artifactId>
  <version>2.5.1</version>
  <configuration>
    <!-- source and target must agree: javac refuses a target release that
         is lower than the source release ("source release 1.7 requires
         target release 1.7"). Both are set to 1.6, the bytecode level this
         build already targeted. -->
    <source>1.6</source>
    <target>1.6</target>
  </configuration>
</plugin>
</plugins>
</build>

<repositories>
<repository>
<id>local</id>
<name>Local repository in project tree</name>
<url>file:${basedir}/lib</url>
</repository>
</repositories>


<dependencies>

<dependency>
<groupId>org.openrdf.sesame</groupId>
<artifactId>sesame-model</artifactId>
<version>2.2.4</version>
</dependency>

<dependency>
<groupId>org.openrdf.sesame</groupId>
<artifactId>sesame-rio-api</artifactId>
<version>2.2.4</version>
</dependency>

<dependency>
<groupId>org.openrdf.sesame</groupId>
<artifactId>sesame-rio-ntriples</artifactId>
<version>2.2.4</version>
</dependency>

<!-- org.sindice.siren:siren-core is declared once, next to siren-solr at the
     end of this dependency list (0.2.3-RC2). A second declaration here with
     0.2.1-SNAPSHOT made Maven warn that the dependency must be unique, and
     it was overridden by the later declaration anyway. -->

<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.0</version>
</dependency>

<dependency>
<groupId>net.sf.jopt-simple</groupId>
<artifactId>jopt-simple</artifactId>
<version>3.2-rc1</version>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
<scope>test</scope>
</dependency>

<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>0.9.28</version>
</dependency>

<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>3.1.0</version>
</dependency>

<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.3</version>
</dependency>

<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
<version>3.5.0</version>
</dependency>

<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.1.1</version>
</dependency>

<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.5</version>
</dependency>

<dependency>
<groupId>org.sindice.siren</groupId>
<artifactId>siren-core</artifactId>
<version>0.2.3-RC2</version>
</dependency>

<dependency>
<groupId>org.sindice.siren</groupId>
<artifactId>siren-solr</artifactId>
<version>0.2.3-RC2</version>
</dependency>

</dependencies>

</project>
183 changes: 183 additions & 0 deletions required_siren_schema.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
<?xml version="1.0" encoding="UTF-8" ?>

<!--
This is the SIREn/Solr schema file. This file should be named "schema.xml" and
should be in the conf directory under the solr home
(i.e. ./solr/conf/schema.xml by default)
or located where the classloader for the Solr webapp can find it.

This example schema is the recommended starting point for users.
It should be kept correct and concise, usable out-of-the-box.

For more information, on how to customize this file, please see
http://wiki.apache.org/solr/SchemaXml

PERFORMANCE NOTE: this schema includes many optional features and should not
be used for benchmarking. To improve performance one could
- set stored="false" for all fields possible (esp large fields) when you
only need to search on the field but don't need to return the original
value.
- set indexed="false" if you don't need to search on the field, but only
return the field as a result of searching on other indexed fields.
- remove all unneeded copyField statements
- for best index size and searching performance, set indexed="false"
for all general text fields, use copyField to copy them to the
catchall "text" field, and use that for searching.
- For maximum indexing performance, use the StreamingUpdateSolrServer
java client.
- Remember to run the JVM in server mode, and use a higher logging level
that avoids logging every request
-->

<schema name="example" version="1.3">

<types>

<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>

<!-- A Trie based date field for faster date range queries and date faceting. -->
<fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>

<!-- A uri field that uses WhitespaceTokenizer and WordDelimiterFilter to
split URIs into multiple components. Stopwords are customized by
external files.
omitNorms is true since it is a short field, and norms do not really
make sense for URIs.
Does not use the ASCIIFoldingExpansionFilter since URIs should not
contain accented characters.
-->
<fieldType name="uri" class="solr.TextField" omitNorms="true" positionIncrementGap="100">

  <!-- Index-time analysis chain. -->
  <analyzer type="index">

    <!-- Tokenise on whitespace only. -->
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>

    <!-- Break each token into sub-words at delimiters and at case changes;
         preserveOriginal="1" keeps the unsplit token as well.
         Splitting on numerics is disabled to fix SND-355 and SND-1283. -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1" generateNumberParts="1"
            catenateWords="0" catenateNumbers="0" catenateAll="0"
            splitOnCaseChange="1" splitOnNumerics="0"
            preserveOriginal="1"/>

    <!-- Keep only tokens whose length lies between min and max, inclusive. -->
    <filter class="solr.LengthFilterFactory" min="2" max="256"/>

    <!-- Lowercase every token. -->
    <filter class="solr.LowerCaseFilterFactory"/>

    <!-- Case-insensitive stop word removal. enablePositionIncrements is
         true in both the index and the query analyzer so that removed stop
         words leave a 'gap', which gives more accurate phrase queries. -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>

  </analyzer>

  <!-- Query-time analysis chain. -->
  <analyzer type="query">

    <!-- Whitespace tokenizer, so that a URI is not broken apart. -->
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>

    <!-- Keep only tokens whose length lies between min and max, inclusive. -->
    <filter class="solr.LengthFilterFactory" min="2" max="256"/>

    <!-- Lowercase every token. -->
    <filter class="solr.LowerCaseFilterFactory"/>

    <!-- Same stop word settings as the index analyzer. -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>

    <!-- Replace QNames by their namespaces in URIs. -->
    <filter class="org.sindice.siren.solr.analysis.QNamesFilterFactory" qnames="qnames.txt"/>

  </analyzer>

</fieldType>

<!--
The SIREn field type:
The top-level analyzers must be defined in the top-level analyzer
configuration file (tuple-analyzers.xml) and the datatype analyzers in
the datatype analyzer configuration file (tuple-datatypes.xml).

Field norms are not useful for SIREn fields. Setting omitNorms to true
reduces memory consumption and improves ranking.

omitTermFreqAndPositions *must* be set to false.
-->
<!-- SIREn field type used by the n-triple indexing scheme. -->
<fieldType name="ntriple"
           class="org.sindice.siren.solr.schema.SirenField"
           omitNorms="true"
           omitTermFreqAndPositions="false"
           analyzerConfig="tuple-analyzers.xml"
           datatypeConfig="tuple-datatypes.xml"/>

<!-- SIREn field type used by the tabular indexing scheme. -->
<fieldType name="tabular"
           class="org.sindice.siren.solr.schema.SirenField"
           omitNorms="true"
           omitTermFreqAndPositions="false"
           analyzerConfig="tuple-analyzers.xml"
           datatypeConfig="tuple-datatypes.xml"/>

</types>


<fields>

<!-- The ID (URL) of the document
Use the 'string' field type (no tokenisation)
-->
<field name="id" type="string" indexed="true" stored="true" required="true"/>

<!-- Optional label, indexed and stored verbatim ('string' field type). -->
<field name="label" type="string" indexed="true" stored="true" required="false"/>

<!-- Optional description, indexed and stored verbatim ('string' field type). -->
<field name="description" type="string" indexed="true" stored="true" required="false"/>

<!-- The URL of the document
Use the 'uri' field type in order to be tokenised
-->
<field name="url" type="uri" indexed="true" stored="true" required="true"/>

<!-- Optional field analysed with the 'ntriple' (SirenField) field type.
NOTE(review): unlike the two fields below, multiValued is not set
explicitly here; confirm this is intended.
-->
<field name="type" type="ntriple" indexed="true" stored="true" required="false"/>

<!-- n-triple indexing scheme -->
<field name="ntriple" type="ntriple" indexed="true" stored="true" multiValued="false"/>

<!-- tabular indexing scheme (indexed only, not stored) -->
<field name="tabular" type="tabular" indexed="true" stored="false" multiValued="false"/>

</fields>

<!-- Field to use to determine and enforce document uniqueness.
Unless this field is marked with required="false", it will be a required field
-->
<uniqueKey>id</uniqueKey>

<!-- field for the QueryParser to use when an explicit fieldname is absent -->
<defaultSearchField>ntriple</defaultSearchField>

<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
<solrQueryParser defaultOperator="AND"/>

<!-- copyField commands copy one field to another at the time a document
is added to the index. It's used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching.

Here the value of "url" is copied into "id", the field named in the
uniqueKey element of this schema.
NOTE(review): "id" is declared without multiValued, so this presumably
expects documents to supply only "url" and never "id" themselves; confirm
with the indexing client. -->
<copyField source="url" dest="id"/>

<!-- Similarity is the scoring routine for each document vs. a query.
A custom similarity for Siren is specified here -->
<similarity class="org.sindice.siren.similarity.SirenSimilarity"/>

</schema>
Loading