Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
<module>add-language-of-descriptions</module>
<module>identify-main-identifier</module>
<module>leave-only-one-identifier</module>
<module>relativize-identifiers</module>
</modules>
<dependencyManagement>
<dependencies>
Expand Down
10 changes: 10 additions & 0 deletions relativize-identifiers/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
relativize-identifiers
=========================

Part of the ehri-ead-preprocessing tools to normalise EAD files before importing into the EHRI database.

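precondition: The EAD file has absolute identifiers, i.e. the unitid of each c-level includes the full ID of its parent's unitid.
postcondition: A copy of the EAD file with relative identifiers is written to a new file whose name is the input name with `.xml` replaced by `_relid.xml`.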

usage:
java -jar relativize-identifiers/target/relativize-identifiers-1.0-SNAPSHOT-jar-with-dependencies.jar <ead.xml>
72 changes: 72 additions & 0 deletions relativize-identifiers/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
<?xml version="1.0"?>
<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<!-- Module POM for the relativize-identifiers tool: declares its dependencies and
     packages it as a single executable jar that bundles those dependencies. -->
<modelVersion>4.0.0</modelVersion>
<!-- Inherits shared build settings from the ead-preprocessing parent project. -->
<parent>
<groupId>ehri-project-preprocess</groupId>
<artifactId>ead-preprocessing</artifactId>
<version>1.0</version>
</parent>
<!-- NOTE(review): this groupId differs from the parent's (ehri-project-preprocess);
     confirm that is intentional and consistent with the sibling modules. -->
<groupId>ehri-project</groupId>
<artifactId>relativize-identifiers</artifactId>
<version>1.0-SNAPSHOT</version>
<name>relativize-identifiers</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Test-only dependency: JUnit 4 is used by RelativizeIdentifiersTest. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.10</version>
<scope>test</scope>
</dependency>

<!-- STAX parser -->
<dependency>
<groupId>stax</groupId>
<artifactId>stax</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>stax</groupId>
<artifactId>stax-api</artifactId>
<version>1.0.1</version>
</dependency>
<!-- NOTE(review): RelativizeIdentifiers.java imports nothing from commons-io;
     confirm this dependency is actually needed by the module. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-io</artifactId>
<version>1.3.2</version>
</dependency>

</dependencies>

<build>
<plugins>
<!-- Builds the "jar-with-dependencies" assembly and sets the jar manifest's main
     class, so the tool can be started with "java -jar". -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>eu.ehri.relativize_identifiers.RelativizeIdentifiers</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id> <!-- this is used for inheritance merges -->
<phase>package</phase> <!-- bind to the packaging phase -->
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package eu.ehri.relativize_identifiers;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
import java.util.regex.Pattern;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.XMLEvent;


/**
 * Rewrites the {@code unitid} values of an EAD file so that the identifier of each
 * component ({@code c}, {@code c01}, {@code c02}, ...) no longer repeats the identifier
 * of its nearest identified ancestor.
 *
 * <p>The document is streamed event by event; every event other than the text of a
 * relativized {@code unitid} is copied to the output unchanged.
 */
public class RelativizeIdentifiers {

    /** Replaces the {@code .xml} extension of the input file name to form the output file name. */
    public final static String SUFFIX = "_relid.xml";

    static XMLEventFactory eventFactory = XMLEventFactory.newInstance();
    static XMLOutputFactory factory = XMLOutputFactory.newInstance();

    private static final String XML_EXTENSION = ".xml";

    /** Encoding used by {@link #main(String[])} for the output file. */
    private static final String OUTPUT_ENCODING = "UTF-8";

    /** Local names of EAD component elements: unnumbered {@code c} and numbered {@code cNN}. */
    private static final Pattern COMPONENT_PATTERN = Pattern.compile("c(\\d\\d)?");

    /** Separator characters (whitespace, dash, colon, underscore, slash) dropped after the parent ID. */
    private static final String SEPARATORS = "[\\s\\-:_\\/]*";

    /**
     * Command-line entry point: relativizes the identifiers of the EAD file named by
     * {@code args[0]} and writes the result to the file named by {@link #outputFileName(String)}.
     *
     * @param args a single argument, the path of the EAD file
     */
    public static void main(String[] args) throws XMLStreamException, javax.xml.stream.FactoryConfigurationError, IOException {
        if (args.length < 1) {
            System.err.println("usage: RelativizeIdentifiers <ead.xml>");
            System.exit(1);
            return;
        }
        String eadfile = args[0];
        String outputfile = outputFileName(eadfile);
        // An explicit encoding is used instead of FileWriter's platform default, so the
        // bytes on disk do not depend on the machine the tool runs on.
        Writer output = new OutputStreamWriter(new FileOutputStream(outputfile), OUTPUT_ENCODING);
        try {
            RelativizeIdentifiers.relativizeIdentifiers(eadfile, output);
        } finally {
            output.close();
        }
    }

    /**
     * Derives the output file name from the input file name. A trailing {@code .xml} is
     * replaced by {@link #SUFFIX}; any other name gets {@link #SUFFIX} appended, so the
     * result is never equal to the input name.
     *
     * @param eadfile the name of the input file
     * @return the name of the file the relativized document is written to
     */
    static String outputFileName(String eadfile) {
        if (eadfile.endsWith(XML_EXTENSION)) {
            return eadfile.substring(0, eadfile.length() - XML_EXTENSION.length()) + SUFFIX;
        }
        return eadfile + SUFFIX;
    }

    /**
     * precondition: The EAD file has absolute identifiers, where unitids in each c-level
     * include the full ID of their parent unitid
     * postcondition: The document written to {@code outputWriter} has relative identifiers.
     *
     * <p>A unitid is rewritten only when it starts with the ID of the nearest enclosing
     * scope that has one; the parent ID and any separator characters following it are
     * removed. A unitid that would become empty is left unchanged. Only the first unitid
     * of a component is used as the reference for its descendants.
     *
     * <p>{@code outputWriter} is not closed by this method; that is left to the caller.
     *
     * @param eadfile the name of the ead file
     * @param outputWriter the writer that receives the rewritten document
     * @return always {@code null}
     * @throws javax.xml.stream.XMLStreamException if the input cannot be parsed or the output cannot be written
     * @throws javax.xml.parsers.FactoryConfigurationError
     * @throws java.io.IOException if the input file cannot be opened or closed
     */
    public static String relativizeIdentifiers(String eadfile, Writer outputWriter)
            throws XMLStreamException, FactoryConfigurationError, IOException {

        FileInputStream fileInputStreamEAD = new FileInputStream(eadfile);
        try {
            XMLEventWriter writer = factory.createXMLEventWriter(outputWriter);
            try {
                XMLEventReader xmlEventReaderEAD =
                        XMLInputFactory.newInstance().createXMLEventReader(fileInputStreamEAD);
                try {
                    copyWithRelativeIds(xmlEventReaderEAD, writer);
                } finally {
                    xmlEventReaderEAD.close();
                }
            } finally {
                writer.close();
            }
        } finally {
            // Closing the event reader does not release the underlying stream.
            fileInputStreamEAD.close();
        }
        return null;
    }

    /** Copies every event from reader to writer, rewriting unitid text on the way. */
    private static void copyWithRelativeIds(XMLEventReader reader, XMLEventWriter writer)
            throws XMLStreamException {
        // One slot per open scope: slot 0 is the document level and every open component
        // adds one. A slot stays null until the first unitid of that scope has been seen,
        // so a component without a unitid cannot disturb the IDs of its ancestors.
        List<String> scopeIds = new ArrayList<String>();
        scopeIds.add(null);

        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            writer.add(event);
            if (event.isStartElement()) {
                String name = event.asStartElement().getName().getLocalPart();
                if (isComponent(name)) {
                    scopeIds.add(null);
                } else if (name.equals("unitid")) {
                    relativizeUnitId(reader, writer, scopeIds);
                }
            } else if (event.isEndElement()) {
                if (isComponent(event.asEndElement().getName().getLocalPart())) {
                    scopeIds.remove(scopeIds.size() - 1);
                }
            }
        }
    }

    /**
     * Handles the text directly following a unitid start tag: writes it relativized
     * against the nearest ancestor ID and records it as the ID of the current scope.
     */
    private static void relativizeUnitId(XMLEventReader reader, XMLEventWriter writer,
            List<String> scopeIds) throws XMLStreamException {
        // The parser may deliver one text run as several Characters events (for example
        // around "&amp;"), so all leading text events are joined. Anything that is not
        // text is left in the reader for the main loop to copy.
        List<XMLEvent> textEvents = new ArrayList<XMLEvent>();
        StringBuilder text = new StringBuilder();
        while (reader.hasNext() && reader.peek().isCharacters()) {
            XMLEvent textEvent = reader.nextEvent();
            textEvents.add(textEvent);
            text.append(textEvent.asCharacters().getData());
        }

        String thisId = text.toString();
        String parentId = nearestAncestorId(scopeIds);
        boolean blank = thisId.trim().length() == 0;
        String newId = (blank || parentId == null) ? thisId : stripParentPrefix(thisId, parentId);

        if (newId.equals(thisId)) {
            // Nothing to rewrite: pass the original events through untouched.
            for (XMLEvent textEvent : textEvents) {
                writer.add(textEvent);
            }
        } else {
            Characters chars = eventFactory.createCharacters(newId);
            writer.add(chars);
        }

        int current = scopeIds.size() - 1;
        if (!blank && scopeIds.get(current) == null) {
            // Descendants are compared against the absolute ID, not the rewritten one.
            scopeIds.set(current, thisId);
        }
    }

    /** Returns the ID of the closest enclosing scope that has one, or null if there is none. */
    private static String nearestAncestorId(List<String> scopeIds) {
        for (int i = scopeIds.size() - 2; i >= 0; i--) {
            String candidate = scopeIds.get(i);
            if (candidate != null) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Removes a leading parentId and the separator characters after it from id. Returns
     * id unchanged when it does not start with parentId or when nothing would be left.
     */
    private static String stripParentPrefix(String id, String parentId) {
        if (!id.startsWith(parentId)) {
            return id;
        }
        // Replace the ID and any non-ID trailing chars, such as spaces,
        // colons, or dashes.
        String regex = "^" + Pattern.quote(parentId) + SEPARATORS;
        String relative = id.replaceFirst(regex, "");
        return relative.length() == 0 ? id : relative;
    }

    /** Tells whether localName is an EAD component element name. */
    private static boolean isComponent(String localName) {
        return COMPONENT_PATTERN.matcher(localName).matches();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package eu.ehri.relativize_identifiers;

import org.junit.Before;
import org.junit.Test;
import org.w3c.dom.Document;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.transform.dom.DOMSource;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import java.io.*;
import java.net.URISyntaxException;
import java.net.URL;

import static org.junit.Assert.assertEquals;

/**
 * Tests for {@link RelativizeIdentifiers}. Each test runs one fixture from the test
 * resources through {@code relativizeIdentifiers} and checks selected unitid values
 * of the resulting document with XPath.
 *
 * @author Mike Bryant (http://github.com/mikesname)
 */
public class RelativizeIdentifiersTest {

    private DocumentBuilder builder;
    private XPath xpath;

    @Before
    public void setUp() throws Exception {
        builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        xpath = XPathFactory.newInstance().newXPath();
    }

    @Test
    public void testRelativizeIdentifiersWithSpaces() throws Exception {
        Document outDoc = getOutputDocument("/absoluteids-spaces.xml");
        assertUnitId("1", "/ead/archdesc/dsc/c01/did/unitid", outDoc);
        assertUnitId("1", "/ead/archdesc/dsc/c01/c02[1]/did/unitid", outDoc);
        assertUnitId("1", "/ead/archdesc/dsc/c01/c02[1]/c03/did/unitid", outDoc);
        assertUnitId("2", "/ead/archdesc/dsc/c01/c02[2]/did/unitid", outDoc);
        assertUnitId("1", "/ead/archdesc/dsc/c01/c02[2]/c03/did/unitid", outDoc);
        // The parent ID repeated after the start of the value must be left alone.
        assertUnitId("2 root 1 2", "/ead/archdesc/dsc/c01/c02[2]/c03[2]/did/unitid", outDoc);
    }

    @Test
    public void testRelativizeIdentifiersWithHyphens() throws Exception {
        Document outDoc = getOutputDocument("/absoluteids-hyphens.xml");
        assertUnitId("1", "/ead/archdesc/dsc/c01/did/unitid", outDoc);
        assertUnitId("1", "/ead/archdesc/dsc/c01/c02[1]/did/unitid", outDoc);
        assertUnitId("1", "/ead/archdesc/dsc/c01/c02[1]/c03/did/unitid", outDoc);
        assertUnitId("2", "/ead/archdesc/dsc/c01/c02[2]/did/unitid", outDoc);
        assertUnitId("1", "/ead/archdesc/dsc/c01/c02[2]/c03/did/unitid", outDoc);
    }

    @Test
    public void testRelativizeIdentifiersWithSlashes() throws Exception {
        Document outDoc = getOutputDocument("/wp2_jmp_ead.xml");
        assertUnitId("COLLECTION.JMP.SHOAH/T", "/ead/archdesc/did/unitid", outDoc);
        assertUnitId("2", "/ead/archdesc/dsc/c01[1]/did/unitid", outDoc);
        assertUnitId("A", "/ead/archdesc/dsc/c01[1]/c02[1]/did/unitid", outDoc);
        assertUnitId("1", "/ead/archdesc/dsc/c01[1]/c02[1]/c03[1]/did/unitid", outDoc);
        assertUnitId("a", "/ead/archdesc/dsc/c01[1]/c02[1]/c03[1]/c04[1]/did/unitid", outDoc);
        assertUnitId("028", "/ead/archdesc/dsc/c01[1]/c02[1]/c03[1]/c04[1]/c05[1]/did/unitid", outDoc);
    }

    @Test
    public void testRelativizeIdentifiersAlreadyRelative() throws Exception {
        Document outDoc = getOutputDocument("/relativeids.xml");
        assertUnitId("c1", "/ead/archdesc/dsc/c01/did/unitid", outDoc);
        assertUnitId("c2-1", "/ead/archdesc/dsc/c01/c02[1]/did/unitid", outDoc);
        assertUnitId("c3-1", "/ead/archdesc/dsc/c01/c02[1]/c03/did/unitid", outDoc);
        assertUnitId("c2-2", "/ead/archdesc/dsc/c01/c02[2]/did/unitid", outDoc);
        assertUnitId("c3-1", "/ead/archdesc/dsc/c01/c02[2]/c03/did/unitid", outDoc);
    }

    /** Asserts that the string value of the given XPath expression in doc equals expected. */
    private void assertUnitId(String expected, String path, Document doc) throws Exception {
        assertEquals(expected, xpath.compile(path).evaluate(doc));
    }

    /**
     * Runs the named classpath resource through {@code relativizeIdentifiers} and parses
     * the output into a DOM document.
     *
     * @param resourceName classpath-absolute name of the fixture, e.g. {@code /relativeids.xml}
     * @return the parsed output document
     */
    private Document getOutputDocument(String resourceName) throws URISyntaxException, XMLStreamException,
            IOException, SAXException {
        URL resource = RelativizeIdentifiersTest.class.getResource(resourceName);
        if (resource == null) {
            throw new FileNotFoundException("Test resource not found: " + resourceName);
        }
        String path = new File(resource.toURI()).getAbsolutePath();
        StringWriter stringWriter = new StringWriter();
        RelativizeIdentifiers.relativizeIdentifiers(path, stringWriter);
        stringWriter.close();
        // Parse from characters rather than from platform-charset bytes, so the result
        // does not depend on the default encoding of the machine running the tests.
        return builder.parse(new org.xml.sax.InputSource(new StringReader(stringWriter.toString())));
    }
}
37 changes: 37 additions & 0 deletions relativize-identifiers/src/test/resources/absoluteids-hyphens.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<ead>
<eadheader>

</eadheader>
<archdesc>
<did>
<unitid>root</unitid>
</did>
<dsc type="combined">
<c01>
<did>
<unitid>root-1</unitid>
</did>
<c02>
<did>
<unitid>root-1-1</unitid>
</did>
<c03>
<did>
<unitid>root-1-1-1</unitid>
</did>
</c03>
</c02>
<c02>
<did>
<unitid>root-1-2</unitid>
</did>
<c03>
<did>
<unitid>root-1-2-1</unitid>
</did>
</c03>
</c02>
</c01>
</dsc>
</archdesc>
</ead>
44 changes: 44 additions & 0 deletions relativize-identifiers/src/test/resources/absoluteids-spaces.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<ead>
<eadheader>

</eadheader>
<archdesc>
<did>
<unitid>root</unitid>
</did>
<dsc type="combined">
<c01>
<did>
<unitid>root 1</unitid>
</did>
<c02>
<did>
<unitid>root 1 1</unitid>
</did>
<c03>
<did>
<unitid>root 1 1 1</unitid>
</did>
</c03>
</c02>
<c02>
<did>
<unitid>root 1 2</unitid>
</did>
<c03>
<did>
<unitid>root 1 2 1</unitid>
</did>
</c03>
<c03>
<did>
<!-- Ensure an ID with its parent ID repeated somewhere
other than the start doesn't get munged. -->
<unitid>root 1 2 2 root 1 2</unitid>
</did>
</c03>
</c02>
</c01>
</dsc>
</archdesc>
</ead>
37 changes: 37 additions & 0 deletions relativize-identifiers/src/test/resources/relativeids.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<ead>
<eadheader>

</eadheader>
<archdesc>
<did>
<unitid>root</unitid>
</did>
<dsc type="combined">
<c01>
<did>
<unitid>c1</unitid>
</did>
<c02>
<did>
<unitid>c2-1</unitid>
</did>
<c03>
<did>
<unitid>c3-1</unitid>
</did>
</c03>
</c02>
<c02>
<did>
<unitid>c2-2</unitid>
</did>
<c03>
<did>
<unitid>c3-1</unitid>
</did>
</c03>
</c02>
</c01>
</dsc>
</archdesc>
</ead>
Loading