diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f11b9e..d4247a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 4.1.0 + - bugfix: catch xpath-expression syntax error instead of impacting pipeline. Fix #19 + - bugfix: report xml parsing error when parsing only with xpath and store_xml => false (using Nokogiri internally). Fix #10 + - config: allow to create xpath-expression from event data using dynamic syntax + - config: fail at startup if store_xml => false and no xpath config specified, as the filter would do nothing + - internal: do not parse document with Nokogiri when xpath contains no expressions + - internal: restructure tests using contexts + ## 4.0.0 - breaking,config: New configuration `suppress_empty`. Default to true change default behaviour of the plugin in favor of avoiding mapping conflicts when reaching elasticsearch - config: New configuration `force_content`. By default the filter expands attributes differently from content in xml elements. diff --git a/lib/logstash/filters/xml.rb b/lib/logstash/filters/xml.rb index ba92bef..63eb27f 100644 --- a/lib/logstash/filters/xml.rb +++ b/lib/logstash/filters/xml.rb @@ -110,6 +110,15 @@ def register :error => "When the 'store_xml' configuration option is true, 'target' must also be set" ) end + + if xpath.empty? && !@store_xml + raise LogStash::ConfigurationError, I18n.t( + "logstash.agent.configuration.invalid_plugin_register", + :plugin => "filter", + :type => "xml", + :error => "When the 'store_xml' configuration option is false, 'xpath' must contains elements" + ) + end end def filter(event) @@ -139,9 +148,10 @@ def filter(event) # Do nothing with an empty string. return if value.strip.empty? - if @xpath + unless @xpath.empty? begin doc = Nokogiri::XML(value, nil, value.encoding.to_s) + raise doc.errors unless doc.errors.empty? rescue => e event.tag(XMLPARSEFAILURE_TAG) @logger.warn("Error parsing xml", :source => @source, :value => value, :exception => e, :backtrace => e.backtrace) @@ -149,8 +159,16 @@ def filter(event) end doc.remove_namespaces! if @remove_namespaces - @xpath.each do |xpath_src, xpath_dest| - nodeset = @namespaces.empty? ? doc.xpath(xpath_src) : doc.xpath(xpath_src, @namespaces) + @xpath.each do |xpath_syntax, xpath_dest| + xpath_src = event.sprintf(xpath_syntax) + begin + nodeset = @namespaces.empty? ? doc.xpath(xpath_src) : doc.xpath(xpath_src, @namespaces) + rescue Nokogiri::XML::XPath::SyntaxError => e + event.tag("_xpathsyntaxfailure") + @logger.warn("Trouble parsing the xpath expression", :source => @source, :value => value, :xpath => xpath_src, + :exception => e, :backtrace => e.backtrace) + return + end # If asking xpath for a String, like "name(/*)", we get back a # String instead of a NodeSet. We normalize that here. diff --git a/logstash-filter-xml.gemspec b/logstash-filter-xml.gemspec index a4d9869..42113d3 100644 --- a/logstash-filter-xml.gemspec +++ b/logstash-filter-xml.gemspec @@ -1,7 +1,7 @@ Gem::Specification.new do |s| s.name = 'logstash-filter-xml' - s.version = '4.0.0' + s.version = '4.1.0' s.licenses = ['Apache License (2.0)'] s.summary = "Takes a field that contains XML and expands it into an actual datastructure." s.description = "This gem is a Logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/logstash-plugin install gemname. This gem is not a stand-alone program" diff --git a/spec/filters/xml_spec.rb b/spec/filters/xml_spec.rb index 4c73b30..7063adf 100644 --- a/spec/filters/xml_spec.rb +++ b/spec/filters/xml_spec.rb @@ -4,113 +4,93 @@ describe LogStash::Filters::Xml do - describe "parse standard xml (Deprecated checks)" do - config <<-CONFIG - filter { - xml { - source => "raw" - target => "data" - } - } - CONFIG + # Common tests to validate behaviour of used xml parsing library + RSpec.shared_examples "report _xmlparsefailure" do |config_string| - sample("raw" => '') do - insist { subject.get("tags") }.nil? - insist { subject.get("data")} == {"key" => "value"} - end - - #From parse xml with array as a value - sample("raw" => 'value1value2') do - insist { subject.get("tags") }.nil? - insist { subject.get("data")} == {"key" => ["value1", "value2"]} - end + describe "parsing standard xml (Deprecated checks)" do + config(config_string) - #From parse xml with hash as a value - sample("raw" => 'value') do - insist { subject.get("tags") }.nil? - insist { subject.get("data")} == {"key1" => [{"key2" => ["value"]}]} - end + sample("xmldata" => '') do + insist { subject.get("tags") }.nil? + end - # parse xml in single item array - sample("raw" => [""]) do - insist { subject.get("tags") }.nil? - insist { subject.get("data")} == {"bar" => "baz"} - end + #From parse xml with array as a value + sample("xmldata" => 'value1value2') do + insist { subject.get("tags") }.nil? + end - # fail in multi items array - sample("raw" => ["", "jojoba"]) do - insist { subject.get("tags") }.include?("_xmlparsefailure") - insist { subject.get("data")} == nil - end + #From parse xml with hash as a value + sample("xmldata" => 'value') do + insist { subject.get("tags") }.nil? + end - # fail in empty array - sample("raw" => []) do - insist { subject.get("tags") }.include?("_xmlparsefailure") - insist { subject.get("data")} == nil - end + # parse xml in single item array + sample("xmldata" => [""]) do + insist { subject.get("tags") }.nil? + end - # fail for non string field - sample("raw" => {"foo" => "bar"}) do - insist { subject.get("tags") }.include?("_xmlparsefailure") - insist { subject.get("data")} == nil - end + # fail in multi items array + sample("xmldata" => ["", "jojoba"]) do + insist { subject.get("tags") }.include?("_xmlparsefailure") + insist { subject.get("data")} == nil + end - # fail for non string single item array - sample("raw" => [{"foo" => "bar"}]) do - insist { subject.get("tags") }.include?("_xmlparsefailure") - insist { subject.get("data")} == nil - end + # fail in empty array + sample("xmldata" => []) do + insist { subject.get("tags") }.include?("_xmlparsefailure") + insist { subject.get("data")} == nil + end - #From bad xml - sample("raw" => ' {"foo" => "bar"}) do + insist { subject.get("tags") }.include?("_xmlparsefailure") + insist { subject.get("data")} == nil + end - describe "parse standard xml but do not store (Deprecated checks)" do - config <<-CONFIG - filter { - xml { - source => "raw" - target => "data" - store_xml => false - } - } - CONFIG + # fail for non string single item array + sample("xmldata" => [{"foo" => "bar"}]) do + insist { subject.get("tags") }.include?("_xmlparsefailure") + insist { subject.get("data")} == nil + end - sample("raw" => '') do - insist { subject.get("tags") }.nil? - insist { subject.get("data")} == nil + #From bad xml + sample("xmldata" => ' "raw" - target => "data" - xpath => [ "/foo/key/text()", "xpath_field" ] - } - } - CONFIG + context "ensure xml parsers consistent report of _xmlparsefailure" do - # Single value - sample("raw" => 'value') do - insist { subject.get("tags") }.nil? - insist { subject.get("xpath_field")} == ["value"] + #XMLSimple only, when no xpath + context "XMLSimple validation" do + config_string = <<-CONFIG + filter { + xml { + source => "xmldata" + target => "data" + } + } + CONFIG + include_examples "report _xmlparsefailure", config_string end - #Multiple values - sample("raw" => 'value1value2') do - insist { subject.get("tags") }.nil? - insist { subject.get("xpath_field")} == ["value1","value2"] + #Nokogiri only when xpath exists and store_xml => false + context "Nokogiri validation" do + config_string = <<-CONFIG + filter { + xml { + source => "xmldata" + store_xml => false + xpath => { "/" => nil } + } + } + CONFIG + include_examples "report _xmlparsefailure", config_string end end - ## New tests - - describe "parse standard xml" do + context "parse standard xml" do config <<-CONFIG filter { xml { @@ -137,264 +117,305 @@ insist { subject.get("data") } == {"key1" => [{"key2" => ["value"]}]} end - #From bad xml - sample("xmldata" => ' "xmldata" + target => "parseddata" + force_array => true + } + } + CONFIG + + # Single value + sample("xmldata" => 'Content') do + insist { subject.get("parseddata") } == { "bar" => ["Content"] } + end + end + + describe "disabled" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + target => "parseddata" + force_array => false + } + } + CONFIG + + # Single value + sample("xmldata" => 'Content') do + insist { subject.get("parseddata") } == { "bar" => "Content" } + end + end + end - describe "parse standard xml but do not store" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - target => "data" - store_xml => false - } - } - CONFIG + context "suppress_empty" do + describe "disabled" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + target => "data" + suppress_empty => false + } + } + CONFIG + + sample("xmldata" => 'value1') do + insist { subject.get("tags") }.nil? + insist { subject.get("data") } == {"key" => ["value1", {}]} + end + end + + describe "enabled" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + target => "data" + suppress_empty => true + } + } + CONFIG + + sample("xmldata" => 'value1') do + insist { subject.get("tags") }.nil? + insist { subject.get("data") } == {"key" => ["value1"]} + end + end + end - sample("xmldata" => '') do - insist { subject.get("tags") }.nil? - insist { subject.get("data")} == nil + context "force_content" do + describe "disabled" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + target => "data" + force_array => false + force_content => false + } + } + CONFIG + + sample("xmldata" => 'text1text2') do + insist { subject.get("tags") }.nil? + insist { subject.get("data") } == { 'x' => 'text1', 'y' => { 'a' => '2', 'content' => 'text2' } } + end + end + describe "enabled" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + target => "data" + force_array => false + force_content => true + } + } + CONFIG + + sample("xmldata" => 'text1text2') do + insist { subject.get("tags") }.nil? + insist { subject.get("data") } == { 'x' => { 'content' => 'text1' }, 'y' => { 'a' => '2', 'content' => 'text2' } } + end + end + end end end - describe "parse xml and store values with xpath" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - target => "data" - xpath => [ "/foo/key/text()", "xpath_field" ] - } - } - CONFIG - - # Single value - sample("xmldata" => 'value') do - insist { subject.get("tags") }.nil? - insist { subject.get("xpath_field") } == ["value"] - end - - #Multiple values - sample("xmldata" => 'value1value2') do - insist { subject.get("tags") }.nil? - insist { subject.get("xpath_field") } == ["value1","value2"] - end - end + context "executing xpath query on the xml content and storing result in a field" do + context "standard usage" do + describe "parse xml and store values with xpath" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + store_xml => false + xpath => [ "/foo/key/text()", "xpath_field" ] + } + } + CONFIG + + # Single value + sample("xmldata" => 'value') do + insist { subject.get("tags") }.nil? + insist { subject.get("xpath_field") } == ["value"] + end + + #Multiple values + sample("xmldata" => 'value1value2') do + insist { subject.get("tags") }.nil? + insist { subject.get("xpath_field") } == ["value1","value2"] + end + end - describe "parse correctly non ascii content with xpath" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - target => "data" - xpath => [ "/foo/key/text()", "xpath_field" ] - } - } - CONFIG + describe "parse correctly non ascii content with xpath" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + store_xml => false + xpath => [ "/foo/key/text()", "xpath_field" ] + } + } + CONFIG - # Single value - sample("xmldata" => 'Français') do - insist { subject.get("tags") }.nil? - insist { subject.get("xpath_field")} == ["Français"] + # Single value + sample("xmldata" => 'Français') do + insist { subject.get("tags") }.nil? + insist { subject.get("xpath_field")} == ["Français"] + end + end end - end - - describe "parse including namespaces" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - xpath => [ "/foo/h:div", "xpath_field" ] - remove_namespaces => false - store_xml => false - } - } - CONFIG - # Single value - sample("xmldata" => 'Content') do - insist { subject.get("xpath_field") } == ["Content"] - end - end + context "xpath expression configuration" do - describe "parse including namespaces declarations on root" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - xpath => [ "/foo/h:div", "xpath_field" ] - namespaces => {"h" => "http://www.w3.org/TR/html4/"} - remove_namespaces => false - store_xml => false + describe "report syntax error" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + target => "data" + xpath => [ "/foo/key/unknown-method()", "xpath_field" ] + } } - } - CONFIG + CONFIG - # Single value - sample("xmldata" => 'Content') do - insist { subject.get("xpath_field") } == ["Content"] + sample("xmldata" => 'Français') do + insist { subject.get("tags") } == ["_xpathsyntaxfailure"] + end end - end - describe "parse including namespaces declarations on child" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - xpath => [ "/foo/h:div", "xpath_field" ] - namespaces => {"h" => "http://www.w3.org/TR/html4/"} - remove_namespaces => false - store_xml => false + describe "retrieved from event field" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + target => "data" + xpath => [ "%{xpath_query}", "xpath_field" ] + } } - } - CONFIG + CONFIG - # Single value - sample("xmldata" => 'Content') do - insist { subject.get("xpath_field") } == ["Content"] + sample("xmldata" => 'Français', "xpath_query" => "/foo/key/text()") do + insist { subject.get("tags") }.nil? + insist { subject.get("xpath_field")} == ["Français"] + end end - end - - describe "parse removing namespaces" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - xpath => [ "/foo/div", "xpath_field" ] - remove_namespaces => true - store_xml => false - } - } - CONFIG - - # Single value - sample("xmldata" => 'Content') do - insist { subject.get("xpath_field") } == ["
Content
"] end - end + context "namespace handling" do + describe "parse including namespaces" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + xpath => [ "/foo/h:div", "xpath_field" ] + remove_namespaces => false + store_xml => false + } + } + CONFIG - describe "parse with forcing array (Default)" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - target => "parseddata" - } - } - CONFIG - - # Single value - sample("xmldata" => 'Content') do - insist { subject.get("parseddata") } == { "bar" => ["Content"] } - end - end + # Single value + sample("xmldata" => 'Content') do + insist { subject.get("xpath_field") } == ["Content"] + end + end - describe "parse disabling forcing array" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - target => "parseddata" - force_array => false - } - } - CONFIG + describe "parse including namespaces declarations on root" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + xpath => [ "/foo/h:div", "xpath_field" ] + namespaces => {"h" => "http://www.w3.org/TR/html4/"} + remove_namespaces => false + store_xml => false + } + } + CONFIG - # Single value - sample("xmldata" => 'Content') do - insist { subject.get("parseddata") } == { "bar" => "Content" } - end - end + # Single value + sample("xmldata" => 'Content') do + insist { subject.get("xpath_field") } == ["Content"] + end + end - context "Using suppress_empty option" do - describe "suppress_empty => false" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - target => "data" - suppress_empty => false + describe "parse including namespaces declarations on child" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + xpath => [ "/foo/h:div", "xpath_field" ] + namespaces => {"h" => "http://www.w3.org/TR/html4/"} + remove_namespaces => false + store_xml => false + } } - } - CONFIG + CONFIG - sample("xmldata" => 'value1') do - insist { subject.get("tags") }.nil? - insist { subject.get("data") } == {"key" => ["value1", {}]} + # Single value + sample("xmldata" => 'Content') do + insist { subject.get("xpath_field") } == ["Content"] + end end - end - describe "suppress_empty => true" do - config <<-CONFIG - filter { - xml { - source => "xmldata" - target => "data" - suppress_empty => true + describe "parse removing namespaces" do + config <<-CONFIG + filter { + xml { + source => "xmldata" + xpath => [ "/foo/div", "xpath_field" ] + remove_namespaces => true + store_xml => false + } } - } - CONFIG + CONFIG - sample("xmldata" => 'value1') do - insist { subject.get("tags") }.nil? - insist { subject.get("data") } == {"key" => ["value1"]} + # Single value + sample("xmldata" => 'Content') do + insist { subject.get("xpath_field") } == ["
Content
"] + end end end end - context "Using force content option" do - describe "force_content => false" do + context "plugin registration checks" do + describe "ensure target is set when store_xml => true" do config <<-CONFIG filter { xml { - source => "xmldata" - target => "data" - force_array => false - force_content => false + xmldata => "message" + store_xml => true } } CONFIG - sample("xmldata" => 'text1text2') do - insist { subject.get("tags") }.nil? - insist { subject.get("data") } == { 'x' => 'text1', 'y' => { 'a' => '2', 'content' => 'text2' } } + sample("xmldata" => "random message") do + insist { subject }.raises(LogStash::ConfigurationError) end end - describe "force_content => true" do + + describe "ensure xpath is set when store_xml => false" do config <<-CONFIG filter { xml { source => "xmldata" - target => "data" - force_array => false - force_content => true + store_xml => false } } CONFIG - sample("xmldata" => 'text1text2') do - insist { subject.get("tags") }.nil? - insist { subject.get("data") } == { 'x' => { 'content' => 'text1' }, 'y' => { 'a' => '2', 'content' => 'text2' } } + sample("xmldata" => "random message") do + insist { subject }.raises(LogStash::ConfigurationError) end end end - - describe "plugin registration" do - config <<-CONFIG - filter { - xml { - xmldata => "message" - store_xml => true - } - } - CONFIG - - sample("xmldata" => "random message") do - insist { subject }.raises(LogStash::ConfigurationError) - end - end end