diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9f11b9e..d4247a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+## 4.1.0
+ - bugfix: catch xpath-expression syntax error instead of impacting pipeline. Fix #19
+ - bugfix: report xml parsing error when parsing only with xpath and store_xml => false (using Nokogiri internally). Fix #10
+ - config: allow to create xpath-expression from event data using dynamic syntax
+ - config: fail at startup if store_xml => false and no xpath config specified, as the filter would do nothing
+ - internal: do not parse document with Nokogiri when xpath contains no expressions
+ - internal: restructure tests using contexts
+
## 4.0.0
- breaking,config: New configuration `suppress_empty`. Default to true change default behaviour of the plugin in favor of avoiding mapping conflicts when reaching elasticsearch
- config: New configuration `force_content`. By default the filter expands attributes differently from content in xml elements.
diff --git a/lib/logstash/filters/xml.rb b/lib/logstash/filters/xml.rb
index ba92bef..63eb27f 100644
--- a/lib/logstash/filters/xml.rb
+++ b/lib/logstash/filters/xml.rb
@@ -110,6 +110,15 @@ def register
:error => "When the 'store_xml' configuration option is true, 'target' must also be set"
)
end
+
+ if xpath.empty? && !@store_xml
+ raise LogStash::ConfigurationError, I18n.t(
+ "logstash.agent.configuration.invalid_plugin_register",
+ :plugin => "filter",
+ :type => "xml",
+ :error => "When the 'store_xml' configuration option is false, 'xpath' must contains elements"
+ )
+ end
end
def filter(event)
@@ -139,9 +148,10 @@ def filter(event)
# Do nothing with an empty string.
return if value.strip.empty?
- if @xpath
+ unless @xpath.empty?
begin
doc = Nokogiri::XML(value, nil, value.encoding.to_s)
+ raise doc.errors unless doc.errors.empty?
rescue => e
event.tag(XMLPARSEFAILURE_TAG)
@logger.warn("Error parsing xml", :source => @source, :value => value, :exception => e, :backtrace => e.backtrace)
@@ -149,8 +159,16 @@ def filter(event)
end
doc.remove_namespaces! if @remove_namespaces
- @xpath.each do |xpath_src, xpath_dest|
- nodeset = @namespaces.empty? ? doc.xpath(xpath_src) : doc.xpath(xpath_src, @namespaces)
+ @xpath.each do |xpath_syntax, xpath_dest|
+ xpath_src = event.sprintf(xpath_syntax)
+ begin
+ nodeset = @namespaces.empty? ? doc.xpath(xpath_src) : doc.xpath(xpath_src, @namespaces)
+ rescue Nokogiri::XML::XPath::SyntaxError => e
+ event.tag("_xpathsyntaxfailure")
+ @logger.warn("Trouble parsing the xpath expression", :source => @source, :value => value, :xpath => xpath_src,
+ :exception => e, :backtrace => e.backtrace)
+ return
+ end
# If asking xpath for a String, like "name(/*)", we get back a
# String instead of a NodeSet. We normalize that here.
diff --git a/logstash-filter-xml.gemspec b/logstash-filter-xml.gemspec
index a4d9869..42113d3 100644
--- a/logstash-filter-xml.gemspec
+++ b/logstash-filter-xml.gemspec
@@ -1,7 +1,7 @@
Gem::Specification.new do |s|
s.name = 'logstash-filter-xml'
- s.version = '4.0.0'
+ s.version = '4.1.0'
s.licenses = ['Apache License (2.0)']
s.summary = "Takes a field that contains XML and expands it into an actual datastructure."
s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program"
diff --git a/spec/filters/xml_spec.rb b/spec/filters/xml_spec.rb
index 4c73b30..7063adf 100644
--- a/spec/filters/xml_spec.rb
+++ b/spec/filters/xml_spec.rb
@@ -4,113 +4,93 @@
describe LogStash::Filters::Xml do
- describe "parse standard xml (Deprecated checks)" do
- config <<-CONFIG
- filter {
- xml {
- source => "raw"
- target => "data"
- }
- }
- CONFIG
+ # Common tests to validate behaviour of used xml parsing library
+ RSpec.shared_examples "report _xmlparsefailure" do |config_string|
- sample("raw" => '') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("data")} == {"key" => "value"}
- end
-
- #From parse xml with array as a value
- sample("raw" => 'value1value2') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("data")} == {"key" => ["value1", "value2"]}
- end
+ describe "parsing standard xml (Deprecated checks)" do
+ config(config_string)
- #From parse xml with hash as a value
- sample("raw" => 'value') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("data")} == {"key1" => [{"key2" => ["value"]}]}
- end
+ sample("xmldata" => '') do
+ insist { subject.get("tags") }.nil?
+ end
- # parse xml in single item array
- sample("raw" => [""]) do
- insist { subject.get("tags") }.nil?
- insist { subject.get("data")} == {"bar" => "baz"}
- end
+ #From parse xml with array as a value
+ sample("xmldata" => 'value1value2') do
+ insist { subject.get("tags") }.nil?
+ end
- # fail in multi items array
- sample("raw" => ["", "jojoba"]) do
- insist { subject.get("tags") }.include?("_xmlparsefailure")
- insist { subject.get("data")} == nil
- end
+ #From parse xml with hash as a value
+ sample("xmldata" => 'value') do
+ insist { subject.get("tags") }.nil?
+ end
- # fail in empty array
- sample("raw" => []) do
- insist { subject.get("tags") }.include?("_xmlparsefailure")
- insist { subject.get("data")} == nil
- end
+ # parse xml in single item array
+ sample("xmldata" => [""]) do
+ insist { subject.get("tags") }.nil?
+ end
- # fail for non string field
- sample("raw" => {"foo" => "bar"}) do
- insist { subject.get("tags") }.include?("_xmlparsefailure")
- insist { subject.get("data")} == nil
- end
+ # fail in multi items array
+ sample("xmldata" => ["", "jojoba"]) do
+ insist { subject.get("tags") }.include?("_xmlparsefailure")
+ insist { subject.get("data")} == nil
+ end
- # fail for non string single item array
- sample("raw" => [{"foo" => "bar"}]) do
- insist { subject.get("tags") }.include?("_xmlparsefailure")
- insist { subject.get("data")} == nil
- end
+ # fail in empty array
+ sample("xmldata" => []) do
+ insist { subject.get("tags") }.include?("_xmlparsefailure")
+ insist { subject.get("data")} == nil
+ end
- #From bad xml
- sample("raw" => ' {"foo" => "bar"}) do
+ insist { subject.get("tags") }.include?("_xmlparsefailure")
+ insist { subject.get("data")} == nil
+ end
- describe "parse standard xml but do not store (Deprecated checks)" do
- config <<-CONFIG
- filter {
- xml {
- source => "raw"
- target => "data"
- store_xml => false
- }
- }
- CONFIG
+ # fail for non string single item array
+ sample("xmldata" => [{"foo" => "bar"}]) do
+ insist { subject.get("tags") }.include?("_xmlparsefailure")
+ insist { subject.get("data")} == nil
+ end
- sample("raw" => '') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("data")} == nil
+ #From bad xml
+ sample("xmldata" => ' "raw"
- target => "data"
- xpath => [ "/foo/key/text()", "xpath_field" ]
- }
- }
- CONFIG
+ context "ensure xml parsers consistent report of _xmlparsefailure" do
- # Single value
- sample("raw" => 'value') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("xpath_field")} == ["value"]
+ #XMLSimple only, when no xpath
+ context "XMLSimple validation" do
+ config_string = <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ target => "data"
+ }
+ }
+ CONFIG
+ include_examples "report _xmlparsefailure", config_string
end
- #Multiple values
- sample("raw" => 'value1value2') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("xpath_field")} == ["value1","value2"]
+ #Nokogiri only when xpath exists and store_xml => false
+ context "Nokogiri validation" do
+ config_string = <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ store_xml => false
+ xpath => { "/" => nil }
+ }
+ }
+ CONFIG
+ include_examples "report _xmlparsefailure", config_string
end
end
- ## New tests
-
- describe "parse standard xml" do
+ context "parse standard xml" do
config <<-CONFIG
filter {
xml {
@@ -137,264 +117,305 @@
insist { subject.get("data") } == {"key1" => [{"key2" => ["value"]}]}
end
- #From bad xml
- sample("xmldata" => ' "xmldata"
+ target => "parseddata"
+ force_array => true
+ }
+ }
+ CONFIG
+
+ # Single value
+ sample("xmldata" => 'Content') do
+ insist { subject.get("parseddata") } == { "bar" => ["Content"] }
+ end
+ end
+
+ describe "disabled" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ target => "parseddata"
+ force_array => false
+ }
+ }
+ CONFIG
+
+ # Single value
+ sample("xmldata" => 'Content') do
+ insist { subject.get("parseddata") } == { "bar" => "Content" }
+ end
+ end
+ end
- describe "parse standard xml but do not store" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- target => "data"
- store_xml => false
- }
- }
- CONFIG
+ context "suppress_empty" do
+ describe "disabled" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ target => "data"
+ suppress_empty => false
+ }
+ }
+ CONFIG
+
+ sample("xmldata" => 'value1') do
+ insist { subject.get("tags") }.nil?
+ insist { subject.get("data") } == {"key" => ["value1", {}]}
+ end
+ end
+
+ describe "enabled" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ target => "data"
+ suppress_empty => true
+ }
+ }
+ CONFIG
+
+ sample("xmldata" => 'value1') do
+ insist { subject.get("tags") }.nil?
+ insist { subject.get("data") } == {"key" => ["value1"]}
+ end
+ end
+ end
- sample("xmldata" => '') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("data")} == nil
+ context "force_content" do
+ describe "disabled" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ target => "data"
+ force_array => false
+ force_content => false
+ }
+ }
+ CONFIG
+
+ sample("xmldata" => 'text1text2') do
+ insist { subject.get("tags") }.nil?
+ insist { subject.get("data") } == { 'x' => 'text1', 'y' => { 'a' => '2', 'content' => 'text2' } }
+ end
+ end
+ describe "enabled" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ target => "data"
+ force_array => false
+ force_content => true
+ }
+ }
+ CONFIG
+
+ sample("xmldata" => 'text1text2') do
+ insist { subject.get("tags") }.nil?
+ insist { subject.get("data") } == { 'x' => { 'content' => 'text1' }, 'y' => { 'a' => '2', 'content' => 'text2' } }
+ end
+ end
+ end
end
end
- describe "parse xml and store values with xpath" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- target => "data"
- xpath => [ "/foo/key/text()", "xpath_field" ]
- }
- }
- CONFIG
-
- # Single value
- sample("xmldata" => 'value') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("xpath_field") } == ["value"]
- end
-
- #Multiple values
- sample("xmldata" => 'value1value2') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("xpath_field") } == ["value1","value2"]
- end
- end
+ context "executing xpath query on the xml content and storing result in a field" do
+ context "standard usage" do
+ describe "parse xml and store values with xpath" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ store_xml => false
+ xpath => [ "/foo/key/text()", "xpath_field" ]
+ }
+ }
+ CONFIG
+
+ # Single value
+ sample("xmldata" => 'value') do
+ insist { subject.get("tags") }.nil?
+ insist { subject.get("xpath_field") } == ["value"]
+ end
+
+ #Multiple values
+ sample("xmldata" => 'value1value2') do
+ insist { subject.get("tags") }.nil?
+ insist { subject.get("xpath_field") } == ["value1","value2"]
+ end
+ end
- describe "parse correctly non ascii content with xpath" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- target => "data"
- xpath => [ "/foo/key/text()", "xpath_field" ]
- }
- }
- CONFIG
+ describe "parse correctly non ascii content with xpath" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ store_xml => false
+ xpath => [ "/foo/key/text()", "xpath_field" ]
+ }
+ }
+ CONFIG
- # Single value
- sample("xmldata" => 'Français') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("xpath_field")} == ["Français"]
+ # Single value
+ sample("xmldata" => 'Français') do
+ insist { subject.get("tags") }.nil?
+ insist { subject.get("xpath_field")} == ["Français"]
+ end
+ end
end
- end
-
- describe "parse including namespaces" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- xpath => [ "/foo/h:div", "xpath_field" ]
- remove_namespaces => false
- store_xml => false
- }
- }
- CONFIG
- # Single value
- sample("xmldata" => 'Content') do
- insist { subject.get("xpath_field") } == ["Content"]
- end
- end
+ context "xpath expression configuration" do
- describe "parse including namespaces declarations on root" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- xpath => [ "/foo/h:div", "xpath_field" ]
- namespaces => {"h" => "http://www.w3.org/TR/html4/"}
- remove_namespaces => false
- store_xml => false
+ describe "report syntax error" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ target => "data"
+ xpath => [ "/foo/key/unknown-method()", "xpath_field" ]
+ }
}
- }
- CONFIG
+ CONFIG
- # Single value
- sample("xmldata" => 'Content') do
- insist { subject.get("xpath_field") } == ["Content"]
+ sample("xmldata" => 'Français') do
+ insist { subject.get("tags") } == ["_xpathsyntaxfailure"]
+ end
end
- end
- describe "parse including namespaces declarations on child" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- xpath => [ "/foo/h:div", "xpath_field" ]
- namespaces => {"h" => "http://www.w3.org/TR/html4/"}
- remove_namespaces => false
- store_xml => false
+ describe "retrieved from event field" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ target => "data"
+ xpath => [ "%{xpath_query}", "xpath_field" ]
+ }
}
- }
- CONFIG
+ CONFIG
- # Single value
- sample("xmldata" => 'Content') do
- insist { subject.get("xpath_field") } == ["Content"]
+ sample("xmldata" => 'Français', "xpath_query" => "/foo/key/text()") do
+ insist { subject.get("tags") }.nil?
+ insist { subject.get("xpath_field")} == ["Français"]
+ end
end
- end
-
- describe "parse removing namespaces" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- xpath => [ "/foo/div", "xpath_field" ]
- remove_namespaces => true
- store_xml => false
- }
- }
- CONFIG
-
- # Single value
- sample("xmldata" => 'Content') do
- insist { subject.get("xpath_field") } == ["Content
"]
end
- end
+ context "namespace handling" do
+ describe "parse including namespaces" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ xpath => [ "/foo/h:div", "xpath_field" ]
+ remove_namespaces => false
+ store_xml => false
+ }
+ }
+ CONFIG
- describe "parse with forcing array (Default)" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- target => "parseddata"
- }
- }
- CONFIG
-
- # Single value
- sample("xmldata" => 'Content') do
- insist { subject.get("parseddata") } == { "bar" => ["Content"] }
- end
- end
+ # Single value
+ sample("xmldata" => 'Content') do
+ insist { subject.get("xpath_field") } == ["Content"]
+ end
+ end
- describe "parse disabling forcing array" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- target => "parseddata"
- force_array => false
- }
- }
- CONFIG
+ describe "parse including namespaces declarations on root" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ xpath => [ "/foo/h:div", "xpath_field" ]
+ namespaces => {"h" => "http://www.w3.org/TR/html4/"}
+ remove_namespaces => false
+ store_xml => false
+ }
+ }
+ CONFIG
- # Single value
- sample("xmldata" => 'Content') do
- insist { subject.get("parseddata") } == { "bar" => "Content" }
- end
- end
+ # Single value
+ sample("xmldata" => 'Content') do
+ insist { subject.get("xpath_field") } == ["Content"]
+ end
+ end
- context "Using suppress_empty option" do
- describe "suppress_empty => false" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- target => "data"
- suppress_empty => false
+ describe "parse including namespaces declarations on child" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ xpath => [ "/foo/h:div", "xpath_field" ]
+ namespaces => {"h" => "http://www.w3.org/TR/html4/"}
+ remove_namespaces => false
+ store_xml => false
+ }
}
- }
- CONFIG
+ CONFIG
- sample("xmldata" => 'value1') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("data") } == {"key" => ["value1", {}]}
+ # Single value
+ sample("xmldata" => 'Content') do
+ insist { subject.get("xpath_field") } == ["Content"]
+ end
end
- end
- describe "suppress_empty => true" do
- config <<-CONFIG
- filter {
- xml {
- source => "xmldata"
- target => "data"
- suppress_empty => true
+ describe "parse removing namespaces" do
+ config <<-CONFIG
+ filter {
+ xml {
+ source => "xmldata"
+ xpath => [ "/foo/div", "xpath_field" ]
+ remove_namespaces => true
+ store_xml => false
+ }
}
- }
- CONFIG
+ CONFIG
- sample("xmldata" => 'value1') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("data") } == {"key" => ["value1"]}
+ # Single value
+ sample("xmldata" => 'Content') do
+ insist { subject.get("xpath_field") } == ["Content
"]
+ end
end
end
end
- context "Using force content option" do
- describe "force_content => false" do
+ context "plugin registration checks" do
+ describe "ensure target is set when store_xml => true" do
config <<-CONFIG
filter {
xml {
- source => "xmldata"
- target => "data"
- force_array => false
- force_content => false
+ xmldata => "message"
+ store_xml => true
}
}
CONFIG
- sample("xmldata" => 'text1text2') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("data") } == { 'x' => 'text1', 'y' => { 'a' => '2', 'content' => 'text2' } }
+ sample("xmldata" => "random message") do
+ insist { subject }.raises(LogStash::ConfigurationError)
end
end
- describe "force_content => true" do
+
+ describe "ensure xpath is set when store_xml => false" do
config <<-CONFIG
filter {
xml {
source => "xmldata"
- target => "data"
- force_array => false
- force_content => true
+ store_xml => false
}
}
CONFIG
- sample("xmldata" => 'text1text2') do
- insist { subject.get("tags") }.nil?
- insist { subject.get("data") } == { 'x' => { 'content' => 'text1' }, 'y' => { 'a' => '2', 'content' => 'text2' } }
+ sample("xmldata" => "random message") do
+ insist { subject }.raises(LogStash::ConfigurationError)
end
end
end
-
- describe "plugin registration" do
- config <<-CONFIG
- filter {
- xml {
- xmldata => "message"
- store_xml => true
- }
- }
- CONFIG
-
- sample("xmldata" => "random message") do
- insist { subject }.raises(LogStash::ConfigurationError)
- end
- end
end