diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c043303 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target +target/* diff --git a/lib/org/openrdf/sesame/sesame-model/2.2.4/sesame-model-2.2.4.jar b/lib/org/openrdf/sesame/sesame-model/2.2.4/sesame-model-2.2.4.jar new file mode 100644 index 0000000..c706985 Binary files /dev/null and b/lib/org/openrdf/sesame/sesame-model/2.2.4/sesame-model-2.2.4.jar differ diff --git a/lib/org/openrdf/sesame/sesame-rio-api/2.2.4/sesame-rio-api-2.2.4.jar b/lib/org/openrdf/sesame/sesame-rio-api/2.2.4/sesame-rio-api-2.2.4.jar new file mode 100644 index 0000000..ac4a82f Binary files /dev/null and b/lib/org/openrdf/sesame/sesame-rio-api/2.2.4/sesame-rio-api-2.2.4.jar differ diff --git a/lib/org/openrdf/sesame/sesame-rio-ntriples/2.2.4/sesame-rio-ntriples-2.2.4.jar b/lib/org/openrdf/sesame/sesame-rio-ntriples/2.2.4/sesame-rio-ntriples-2.2.4.jar new file mode 100644 index 0000000..80d1ef3 Binary files /dev/null and b/lib/org/openrdf/sesame/sesame-rio-ntriples/2.2.4/sesame-rio-ntriples-2.2.4.jar differ diff --git a/pom.xml b/pom.xml index 3a119bd..392a448 100644 --- a/pom.xml +++ b/pom.xml @@ -37,31 +37,40 @@ maven-compiler-plugin 2.5.1 - 1.7 - 1.7 + 1.6 + 1.6 + + + local + Local repository in project tree + file:${basedir}/lib + + + + org.openrdf.sesame sesame-model - 2.2.4 + 2.7.5 org.openrdf.sesame sesame-rio-api - 2.2.4 + 2.7.5 org.openrdf.sesame sesame-rio-ntriples - 2.2.4 + 2.7.5 @@ -75,6 +84,13 @@ commons-compress 1.0 + + + com.jcraft + jzlib + 1.1.2 + + net.sf.jopt-simple diff --git a/pom.xml~ b/pom.xml~ new file mode 100644 index 0000000..c4e2d76 --- /dev/null +++ b/pom.xml~ @@ -0,0 +1,151 @@ + + + 4.0.0 + + be.ugent.mmlab + siren-eostool + 0.2-SNAPSHOT + eic-entity-tool + + + + + scampi + Stephane Campinas + stephane.campinas::at::deri.org + DERI Galway + http://www.deri.ie/ + + Developer + + + + + + + + maven-assembly-plugin + + + assembly.xml + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.5.1 + + 1.7 + 1.6 + + + + + + + + local + Local repository in project tree + file:${basedir}/lib + + + + + + + + org.openrdf.sesame + sesame-model + 2.2.4 + + + + org.openrdf.sesame + sesame-rio-api + 2.2.4 + + + + org.openrdf.sesame + sesame-rio-ntriples + 2.2.4 + + + + org.sindice.siren + siren-core + 0.2.1-SNAPSHOT + + + + org.apache.commons + commons-compress + 1.0 + + + + net.sf.jopt-simple + jopt-simple + 3.2-rc1 + + + + junit + junit + 4.8.1 + test + + + + ch.qos.logback + logback-classic + 0.9.28 + + + + org.apache.lucene + lucene-core + 3.1.0 + + + + commons-lang + commons-lang + 2.3 + + + + org.apache.solr + solr-solrj + 3.5.0 + + + + commons-logging + commons-logging + 1.1.1 + + + + commons-codec + commons-codec + 1.5 + + + + org.sindice.siren + siren-core + 0.2.3-RC2 + + + + org.sindice.siren + siren-solr + 0.2.3-RC2 + + + + + diff --git a/required_siren_schema.xml b/required_siren_schema.xml new file mode 100644 index 0000000..70a477f --- /dev/null +++ b/required_siren_schema.xml @@ -0,0 +1,183 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + ntriple + + + + + + + + + + + diff --git a/src/main/java/org/sindice/siren/index/Entity.java b/src/main/java/org/sindice/siren/index/Entity.java index c3370de..463b171 100644 --- a/src/main/java/org/sindice/siren/index/Entity.java +++ b/src/main/java/org/sindice/siren/index/Entity.java @@ -38,6 +38,8 @@ public class Entity { final StringBuilder sbMetadata = new StringBuilder(); /* rdf:type statement's objects */ final HashSet type = new HashSet(); + final HashSet label = new HashSet(); + final HashSet description = new HashSet(); final StringBuilder sb = new StringBuilder(); @@ -51,6 +53,8 @@ public void clear() { outTuples.clear(); sb.setLength(0); type.clear(); + label.clear(); + description.clear(); sbMetadata.setLength(0); } @@ -60,8 +64,11 @@ public String getTriples(boolean out) { sb.setLength(0); for (Entry> e : map.entrySet()) { for (String s : e.getValue()){ - sb.append(subject).append(' ').append(e.getKey()).append(' ').append(s).append(" .\n"); - } + if ( (subject.contains("<") && subject.contains(">")) || subject.indexOf('_') == 0) + sb.append(subject).append(' ').append(e.getKey()).append(' ').append(s).append(" .\n"); + else + sb.append('<').append(subject).append('>').append(' ').append(e.getKey()).append(' ').append(s).append(" .\n"); + } } return sb.toString(); } diff --git a/src/main/java/org/sindice/siren/index/IndexingMMLab.java b/src/main/java/org/sindice/siren/index/IndexingMMLab.java index a3bf3c9..a84e61b 100644 --- a/src/main/java/org/sindice/siren/index/IndexingMMLab.java +++ b/src/main/java/org/sindice/siren/index/IndexingMMLab.java @@ -13,7 +13,7 @@ import java.io.IOException; import java.util.Arrays; import java.util.Iterator; -import java.util.zip.GZIPInputStream; +//import java.util.zip.GZIPInputStream; import org.apache.commons.lang.StringUtils; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; @@ -22,11 +22,14 @@ import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer; +import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer; import org.apache.solr.client.solrj.request.UpdateRequest; import org.apache.solr.common.SolrInputDocument; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.jcraft.jzlib.*; + /** * Index a list of entities, creating incoming, outgoing triples fields, subject * and type fields. The type field is a grouping of the rdf:type objects for this @@ -50,6 +53,8 @@ public abstract class IndexingMMLab implements Iterator { final static public String URL = "url"; final static public String NTRIPLE = "ntriple"; final static public String TYPE = "type"; + final static public String LABEL = "label"; + final static public String DESCRIPTION = "description"; /* The dataset files */ protected final File[] input; @@ -98,7 +103,7 @@ public boolean accept(File dir, String name) { this.indexURL = url; - server = new CommonsHttpSolrServer(indexURL); + server = new StreamingUpdateSolrServer(indexURL, COMMIT, 32); // Clear the index if (CLEAR){ clear(); @@ -183,11 +188,15 @@ public void indexIt() final SolrInputDocument document = new SolrInputDocument(); document.addField(URL, StringUtils.strip(entity.subject, "<>")); document.addField(NTRIPLE, cleanup(entity.getTriples(true))); - //document.addField(TYPE, Utils.toString(entity.type)); - - add(document); - - counter = commit(true, counter, entity.subject); + document.addField(TYPE, Utils.toString(entity.type)); + document.addField(LABEL, Utils.toString(entity.label)); + document.addField(DESCRIPTION, Utils.toString(entity.description)); + try { + add(document); + counter = commit(true, counter, entity.subject); + } catch (Exception e) { + logger.error("Error while processing the document: {}", e); + } } commit(false, counter, entity.subject); // Commit what is left } diff --git a/src/main/java/org/sindice/siren/index/SindiceDEIndexing.java b/src/main/java/org/sindice/siren/index/SindiceDEIndexing.java index 841cc35..59d4524 100644 --- a/src/main/java/org/sindice/siren/index/SindiceDEIndexing.java +++ b/src/main/java/org/sindice/siren/index/SindiceDEIndexing.java @@ -91,7 +91,7 @@ public Entity next() { entityByteSize += tarEntry.getSize(); Utils.getFile(reader, tarEntry.getSize(), entity.sb); // Strip outgoing triples from rdf:type statements - Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, entity.type, true); + Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, entity.type, entity.label, entity.description, true); /* * incoming-triples.nt */ @@ -107,7 +107,7 @@ public Entity next() { entity.inTuples.clear(); } else { Utils.getFile(reader, tarEntry.getSize(), entity.sb); - Utils.sortAndFlattenNTriples(entity.sb, entity.inTuples, null, false); + Utils.sortAndFlattenNTriples(entity.sb, entity.inTuples, null, null, null, false); } } while (hasNext(entityID)); // while documents describe the same entity } catch (IOException e) { diff --git a/src/main/java/org/sindice/siren/index/SindiceEDIndexing.java b/src/main/java/org/sindice/siren/index/SindiceEDIndexing.java index 621ef4e..b9ea142 100644 --- a/src/main/java/org/sindice/siren/index/SindiceEDIndexing.java +++ b/src/main/java/org/sindice/siren/index/SindiceEDIndexing.java @@ -82,8 +82,8 @@ public Entity next() { Utils.getFile(reader, tarEntry.getSize(), entity.sb); // Strip outgoing triples from rdf:type statements // SAM - Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, null, true); - //Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, entity.type, true); + //Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, null, null, null, true); + Utils.sortAndFlattenNTriples(entity.sb, entity.outTuples, entity.type, entity.label, entity.description, true); /* * incoming-triples.nt */ @@ -99,7 +99,7 @@ public Entity next() { entity.inTuples.clear(); } else { Utils.getFile(reader, tarEntry.getSize(), entity.sb); - Utils.sortAndFlattenNTriples(entity.sb, entity.inTuples, null, false); + Utils.sortAndFlattenNTriples(entity.sb, entity.inTuples, null, null, null, false); } } while (hasNext(entityID)); // while documents describe the same entity } catch (IOException e) { diff --git a/src/main/java/org/sindice/siren/index/Utils.java b/src/main/java/org/sindice/siren/index/Utils.java index 7257126..b6fb05d 100644 --- a/src/main/java/org/sindice/siren/index/Utils.java +++ b/src/main/java/org/sindice/siren/index/Utils.java @@ -40,6 +40,9 @@ public class Utils { /* byte array used for reading the compressed tar files */ private static final ByteBuffer bbuffer = ByteBuffer.allocate(1024); private static final String RDF_TYPE = ""; + private static final String DC_DESCRIPTION = ""; + private static final String DBP_ABSTRACT = ""; + private static final String RDFS_LABEL = ""; private static final StringBuilder sb = new StringBuilder(); private static RDFParser parser = null; private static StatementCollector collector = null; @@ -106,8 +109,8 @@ private static final void toAsciiString(final StringBuilder data, final int leng * @param types * @param isOut */ - public static void sortAndFlattenNTriples(final StringBuilder triples, final HashMap> map, final HashSet types, final boolean isOut) { - flattenNTriples(triples, map, types, isOut); + public static void sortAndFlattenNTriples(final StringBuilder triples, final HashMap> map, final HashSet types, final HashSet label, final HashSet description, final boolean isOut) { + flattenNTriples(triples, map, types, label, description, isOut); } private static void initParser() { @@ -127,7 +130,7 @@ private static void initParser() { * The list of n-triples. * @return The n-tuples concatenated. */ - private static void flattenNTriples(final StringBuilder triples, final Map> map, final HashSet types, final boolean isOut) { + private static void flattenNTriples(final StringBuilder triples, final Map> map, final HashSet types, final HashSet label, final HashSet description, final boolean isOut) { try { initParser(); parser.parse(new StringReader(triples.toString()), ""); @@ -139,7 +142,13 @@ private static void flattenNTriples(final StringBuilder triples, final Map').toString() : st.getObject().toString(); - if (types != null && predicate.equals(RDF_TYPE)) { + if (label != null && predicate.equals(RDFS_LABEL)) + label.add(object); + if (description != null && predicate.equals(DC_DESCRIPTION)) + description.add(object); + if (description != null && predicate.equals(DBP_ABSTRACT)) + description.add(object); + if (types != null && predicate.equals(RDF_TYPE)) { types.add(object); } else { HashSet hs = map.get(predicate);