diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c043303
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+target
+target/*
diff --git a/lib/org/openrdf/sesame/sesame-model/2.2.4/sesame-model-2.2.4.jar b/lib/org/openrdf/sesame/sesame-model/2.2.4/sesame-model-2.2.4.jar
new file mode 100644
index 0000000..c706985
Binary files /dev/null and b/lib/org/openrdf/sesame/sesame-model/2.2.4/sesame-model-2.2.4.jar differ
diff --git a/lib/org/openrdf/sesame/sesame-rio-api/2.2.4/sesame-rio-api-2.2.4.jar b/lib/org/openrdf/sesame/sesame-rio-api/2.2.4/sesame-rio-api-2.2.4.jar
new file mode 100644
index 0000000..ac4a82f
Binary files /dev/null and b/lib/org/openrdf/sesame/sesame-rio-api/2.2.4/sesame-rio-api-2.2.4.jar differ
diff --git a/lib/org/openrdf/sesame/sesame-rio-ntriples/2.2.4/sesame-rio-ntriples-2.2.4.jar b/lib/org/openrdf/sesame/sesame-rio-ntriples/2.2.4/sesame-rio-ntriples-2.2.4.jar
new file mode 100644
index 0000000..80d1ef3
Binary files /dev/null and b/lib/org/openrdf/sesame/sesame-rio-ntriples/2.2.4/sesame-rio-ntriples-2.2.4.jar differ
diff --git a/pom.xml b/pom.xml
index 3a119bd..392a448 100644
--- a/pom.xml
+++ b/pom.xml
@@ -37,31 +37,40 @@
maven-compiler-plugin
2.5.1
- 1.7
- 1.7
+ 1.6
+ 1.6
+
+
+ local
+ Local repository in project tree
+ file:${basedir}/lib
+
+
+
+
org.openrdf.sesame
sesame-model
- 2.2.4
+ 2.7.5
org.openrdf.sesame
sesame-rio-api
- 2.2.4
+ 2.7.5
org.openrdf.sesame
sesame-rio-ntriples
- 2.2.4
+ 2.7.5
@@ -75,6 +84,13 @@
commons-compress
1.0
+
+
+ com.jcraft
+ jzlib
+ 1.1.2
+
+
net.sf.jopt-simple
diff --git a/pom.xml~ b/pom.xml~
new file mode 100644
index 0000000..c4e2d76
--- /dev/null
+++ b/pom.xml~
@@ -0,0 +1,151 @@
+
+
+ 4.0.0
+
+ be.ugent.mmlab
+ siren-eostool
+ 0.2-SNAPSHOT
+ eic-entity-tool
+
+
+
+
+ scampi
+ Stephane Campinas
+ stephane.campinas::at::deri.org
+ DERI Galway
+ http://www.deri.ie/
+
+ Developer
+
+
+
+
+
+
+
+ maven-assembly-plugin
+
+
+ assembly.xml
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 2.5.1
+
+ 1.7
+ 1.6
+
+
+
+
+
+
+
+ local
+ Local repository in project tree
+ file:${basedir}/lib
+
+
+
+
+
+
+
+ org.openrdf.sesame
+ sesame-model
+ 2.2.4
+
+
+
+ org.openrdf.sesame
+ sesame-rio-api
+ 2.2.4
+
+
+
+ org.openrdf.sesame
+ sesame-rio-ntriples
+ 2.2.4
+
+
+
+ org.sindice.siren
+ siren-core
+ 0.2.1-SNAPSHOT
+
+
+
+ org.apache.commons
+ commons-compress
+ 1.0
+
+
+
+ net.sf.jopt-simple
+ jopt-simple
+ 3.2-rc1
+
+
+
+ junit
+ junit
+ 4.8.1
+ test
+
+
+
+ ch.qos.logback
+ logback-classic
+ 0.9.28
+
+
+
+ org.apache.lucene
+ lucene-core
+ 3.1.0
+
+
+
+ commons-lang
+ commons-lang
+ 2.3
+
+
+
+ org.apache.solr
+ solr-solrj
+ 3.5.0
+
+
+
+ commons-logging
+ commons-logging
+ 1.1.1
+
+
+
+ commons-codec
+ commons-codec
+ 1.5
+
+
+
+ org.sindice.siren
+ siren-core
+ 0.2.3-RC2
+
+
+
+ org.sindice.siren
+ siren-solr
+ 0.2.3-RC2
+
+
+
+
+
diff --git a/required_siren_schema.xml b/required_siren_schema.xml
new file mode 100644
index 0000000..70a477f
--- /dev/null
+++ b/required_siren_schema.xml
@@ -0,0 +1,183 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ id
+
+
+ ntriple
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/main/java/org/sindice/siren/index/Entity.java b/src/main/java/org/sindice/siren/index/Entity.java
index c3370de..463b171 100644
--- a/src/main/java/org/sindice/siren/index/Entity.java
+++ b/src/main/java/org/sindice/siren/index/Entity.java
@@ -38,6 +38,8 @@ public class Entity {
final StringBuilder sbMetadata = new StringBuilder();
/* rdf:type statement's objects */
final HashSet type = new HashSet();
+ final HashSet label = new HashSet();
+ final HashSet description = new HashSet();
final StringBuilder sb = new StringBuilder();
@@ -51,6 +53,8 @@ public void clear() {
outTuples.clear();
sb.setLength(0);
type.clear();
+ label.clear();
+ description.clear();
sbMetadata.setLength(0);
}
@@ -60,8 +64,11 @@ public String getTriples(boolean out) {
sb.setLength(0);
for (Entry> e : map.entrySet()) {
for (String s : e.getValue()){
- sb.append(subject).append(' ').append(e.getKey()).append(' ').append(s).append(" .\n");
- }
+ if ( (subject.contains("<") && subject.contains(">")) || subject.indexOf('_') == 0) // subject already contains both angle brackets, or starts with '_' (presumably a blank node such as _:b0 -- confirm): emit it unchanged
+ sb.append(subject).append(' ').append(e.getKey()).append(' ').append(s).append(" .\n");
+ else // bare subject: wrap it in <...> before writing the statement
+ sb.append('<').append(subject).append('>').append(' ').append(e.getKey()).append(' ').append(s).append(" .\n");
+ }
}
return sb.toString();
}
diff --git a/src/main/java/org/sindice/siren/index/IndexingMMLab.java b/src/main/java/org/sindice/siren/index/IndexingMMLab.java
index a3bf3c9..a84e61b 100644
--- a/src/main/java/org/sindice/siren/index/IndexingMMLab.java
+++ b/src/main/java/org/sindice/siren/index/IndexingMMLab.java
@@ -13,7 +13,7 @@
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
-import java.util.zip.GZIPInputStream;
+//import java.util.zip.GZIPInputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
@@ -22,11 +22,14 @@
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
+import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.jcraft.jzlib.*;
+
/**
* Index a list of entities, creating incoming, outgoing triples fields, subject
* and type fields. The type field is a grouping of the rdf:type objects for this
@@ -50,6 +53,8 @@ public abstract class IndexingMMLab implements Iterator {
final static public String URL = "url";
final static public String NTRIPLE = "ntriple";
final static public String TYPE = "type";
+ final static public String LABEL = "label";
+ final static public String DESCRIPTION = "description";
/* The dataset files */
protected final File[] input;
@@ -98,7 +103,7 @@ public boolean accept(File dir, String name) {
this.indexURL = url;
- server = new CommonsHttpSolrServer(indexURL);
+ server = new StreamingUpdateSolrServer(indexURL, COMMIT, 32); // SolrJ ctor args are (solrServerUrl, queueSize, threadCount): queue sized by COMMIT, 32 sender threads
// Clear the index
if (CLEAR){
clear();
@@ -183,11 +188,15 @@ public void indexIt()
final SolrInputDocument document = new SolrInputDocument();
document.addField(URL, StringUtils.strip(entity.subject, "<>"));
document.addField(NTRIPLE, cleanup(entity.getTriples(true)));
- //document.addField(TYPE, Utils.toString(entity.type));
-
- add(document);
-
- counter = commit(true, counter, entity.subject);
+ document.addField(TYPE, Utils.toString(entity.type));
+ document.addField(LABEL, Utils.toString(entity.label));
+ document.addField(DESCRIPTION, Utils.toString(entity.description));
+ try { // a failure on one entity is logged and the loop moves on to the next entity
+ add(document);
+ counter = commit(true, counter, entity.subject);
+ } catch (Exception e) { // deliberate best-effort catch: do not abort the whole indexing run
+ logger.error("Error while processing the document: {}", entity.subject, e); // {} is filled with the subject; e stays last so SLF4J logs its stack trace
+ }
}
commit(false, counter, entity.subject); // Commit what is left
}
diff --git a/src/main/java/org/sindice/siren/index/SindiceDEIndexing.java b/src/main/java/org/sindice/siren/index/SindiceDEIndexing.java
index 841cc35..59d4524 100644
--- a/src/main/java/org/sindice/siren/index/SindiceDEIndexing.java
+++ b/src/main/java/org/sindice/siren/index/SindiceDEIndexing.java
@@ -91,7 +91,7 @@ public Entity next() {
entityByteSize += tarEntry.getSize();
Utils.getFile(reader, tarEntry.getSize(), entity.sb);
// Strip outgoing triples from rdf:type statements
- Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, entity.type, true);
+ Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, entity.type, entity.label, entity.description, true);
/*
* incoming-triples.nt
*/
@@ -107,7 +107,7 @@ public Entity next() {
entity.inTuples.clear();
} else {
Utils.getFile(reader, tarEntry.getSize(), entity.sb);
- Utils.sortAndFlattenNTriples(entity.sb, entity.inTuples, null, false);
+ Utils.sortAndFlattenNTriples(entity.sb, entity.inTuples, null, null, null, false);
}
} while (hasNext(entityID)); // while documents describe the same entity
} catch (IOException e) {
diff --git a/src/main/java/org/sindice/siren/index/SindiceEDIndexing.java b/src/main/java/org/sindice/siren/index/SindiceEDIndexing.java
index 621ef4e..b9ea142 100644
--- a/src/main/java/org/sindice/siren/index/SindiceEDIndexing.java
+++ b/src/main/java/org/sindice/siren/index/SindiceEDIndexing.java
@@ -82,8 +82,8 @@ public Entity next() {
Utils.getFile(reader, tarEntry.getSize(), entity.sb);
// Strip outgoing triples from rdf:type statements
// SAM
- Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, null, true);
- //Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, entity.type, true);
+ //Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, null, null, null, true);
+ Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, entity.type, entity.label, entity.description, true);
/*
* incoming-triples.nt
*/
@@ -99,7 +99,7 @@ public Entity next() {
entity.inTuples.clear();
} else {
Utils.getFile(reader, tarEntry.getSize(), entity.sb);
- Utils.sortAndFlattenNTriples(entity.sb, entity.inTuples, null, false);
+ Utils.sortAndFlattenNTriples(entity.sb, entity.inTuples, null, null, null, false);
}
} while (hasNext(entityID)); // while documents describe the same entity
} catch (IOException e) {
diff --git a/src/main/java/org/sindice/siren/index/Utils.java b/src/main/java/org/sindice/siren/index/Utils.java
index 7257126..b6fb05d 100644
--- a/src/main/java/org/sindice/siren/index/Utils.java
+++ b/src/main/java/org/sindice/siren/index/Utils.java
@@ -40,6 +40,9 @@ public class Utils {
/* byte array used for reading the compressed tar files */
private static final ByteBuffer bbuffer = ByteBuffer.allocate(1024);
private static final String RDF_TYPE = "";
+ private static final String DC_DESCRIPTION = "";
+ private static final String DBP_ABSTRACT = "";
+ private static final String RDFS_LABEL = "";
private static final StringBuilder sb = new StringBuilder();
private static RDFParser parser = null;
private static StatementCollector collector = null;
@@ -106,8 +109,8 @@ private static final void toAsciiString(final StringBuilder data, final int leng
* @param types
* @param isOut
*/
- public static void sortAndFlattenNTriples(final StringBuilder triples, final HashMap> map, final HashSet types, final boolean isOut) {
- flattenNTriples(triples, map, types, isOut);
+ public static void sortAndFlattenNTriples(final StringBuilder triples, final HashMap> map, final HashSet types, final HashSet label, final HashSet description, final boolean isOut) {
+ flattenNTriples(triples, map, types, label, description, isOut);
}
private static void initParser() {
@@ -127,7 +130,7 @@ private static void initParser() {
* The list of n-triples.
* @return The n-tuples concatenated.
*/
- private static void flattenNTriples(final StringBuilder triples, final Map> map, final HashSet types, final boolean isOut) {
+ private static void flattenNTriples(final StringBuilder triples, final Map> map, final HashSet types, final HashSet label, final HashSet description, final boolean isOut) {
try {
initParser();
parser.parse(new StringReader(triples.toString()), "");
@@ -139,7 +142,13 @@ private static void flattenNTriples(final StringBuilder triples, final Map').toString()
: st.getObject().toString();
- if (types != null && predicate.equals(RDF_TYPE)) {
+ if (label != null && predicate.equals(RDFS_LABEL)) // collect label objects only when the caller passed a set (callers pass null for incoming triples)
+ label.add(object);
+ if (description != null && predicate.equals(DC_DESCRIPTION)) // DC_DESCRIPTION and DBP_ABSTRACT objects both go into the same description set
+ description.add(object);
+ if (description != null && predicate.equals(DBP_ABSTRACT))
+ description.add(object);
+ if (types != null && predicate.equals(RDF_TYPE)) { // NOTE: the checks above are plain ifs, not else-ifs, so label/description statements still reach the else branch below (the map.get(predicate) path); only RDF_TYPE statements skip it
types.add(object);
} else {
HashSet hs = map.get(predicate);