|
| 1 | +// Copyright 2025 The CUE Authors |
| 2 | +// |
| 3 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +// you may not use this file except in compliance with the License. |
| 5 | +// You may obtain a copy of the License at |
| 6 | +// |
| 7 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +// |
| 9 | +// Unless required by applicable law or agreed to in writing, software |
| 10 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +// See the License for the specific language governing permissions and |
| 13 | +// limitations under the License. |
| 14 | + |
| 15 | +// Package koala converts XML to and from CUE, as proposed in https://cuelang.org/discussion/3776. |
| 16 | +// |
| 17 | +// This encoding is inspired by the [BadgerFish] convention for translating XML to JSON. |
| 18 | +// XML elements are modeled as CUE structs, their attributes are modeled as struct fields |
| 19 | +// prefixed with "$", and their inner text content is modeled as a field named "$$". |
| 20 | +// |
| 21 | +// WARNING: THIS PACKAGE IS EXPERIMENTAL. |
| 22 | +// ITS API MAY CHANGE AT ANY TIME. |
| 23 | +// |
| 24 | +// [BadgerFish]: http://www.sklar.com/badgerfish/ |
| 25 | +package koala |
| 26 | + |
| 27 | +import ( |
| 28 | + "bytes" |
| 29 | + "encoding/xml" |
| 30 | + "fmt" |
| 31 | + "io" |
| 32 | + "strings" |
| 33 | + "unicode" |
| 34 | + |
| 35 | + "cuelang.org/go/cue/ast" |
| 36 | + "cuelang.org/go/cue/token" |
| 37 | +) |
| 38 | + |
| 39 | +// Decoder implements the decoding state. |
| 40 | +type Decoder struct { |
| 41 | + reader io.Reader |
| 42 | + fileName string |
| 43 | + tokenFile *token.File |
| 44 | + |
| 45 | + decoderRan bool |
| 46 | + |
| 47 | + // current XML element being processed. |
| 48 | + currXmlElement *xmlElement |
| 49 | + |
| 50 | + // The top-level CUE struct. |
| 51 | + astRoot *ast.StructLit |
| 52 | + // CUE model of ancestors of current XML element being processed. |
| 53 | + ancestors []currFieldInfo |
| 54 | + // CUE model of current XML element being processed. |
| 55 | + currField currFieldInfo |
| 56 | + // CUE model of current XML element's inner content ($$ attribute). |
| 57 | + currInnerText *ast.Field |
| 58 | +} |
| 59 | + |
| 60 | +// currFieldInfo encapsulates details of the CUE field for the current XML element being processed. |
| 61 | +type currFieldInfo struct { |
| 62 | + // CUE model of current XML element. |
| 63 | + field *ast.Field |
| 64 | + // Running map of the current field's children. |
| 65 | + currFieldChildren map[string]*ast.Field |
| 66 | +} |
| 67 | + |
// xmlElement is a node in a minimal tree mirroring the XML document.
// The decoder uses it to resolve namespace prefixes through ancestors and to
// detect mixed content.
type xmlElement struct {
	// xmlName is the element's name as reported by the XML decoder.
	xmlName xml.Name
	// attr holds the attributes from the element's start tag.
	attr []xml.Attr
	// parent is the enclosing element; nil for the root.
	parent *xmlElement
	// children lists the sub-elements seen so far, in document order.
	children []*xmlElement
	// textContentIsWhiteSpace reports whether the most recently recorded
	// character data for this element consisted only of white space.
	textContentIsWhiteSpace bool
}
| 77 | + |
const (
	// contentAttribute is the name of the CUE field that carries the inner
	// text content of an XML element.
	contentAttribute string = "$$"

	// attributeSymbol is prepended to an XML attribute's name to form the
	// name of the CUE field that models it.
	attributeSymbol string = "$"
)
| 83 | + |
| 84 | +// NewDecoder creates a decoder from a stream of XML input. |
| 85 | +func NewDecoder(fileName string, r io.Reader) *Decoder { |
| 86 | + return &Decoder{reader: r, fileName: fileName} |
| 87 | +} |
| 88 | + |
| 89 | +// Decode parses the input stream as XML and converts it to a CUE [ast.Expr]. |
| 90 | +// The input stream is taken from the [Decoder] and consumed. |
| 91 | +func (dec *Decoder) Decode() (ast.Expr, error) { |
| 92 | + if dec.decoderRan { |
| 93 | + return nil, io.EOF |
| 94 | + } |
| 95 | + dec.decoderRan = true |
| 96 | + xmlText, err := io.ReadAll(dec.reader) |
| 97 | + if err != nil { |
| 98 | + return nil, err |
| 99 | + } |
| 100 | + reader := bytes.NewReader(xmlText) |
| 101 | + xmlDec := xml.NewDecoder(reader) |
| 102 | + |
| 103 | + // Create a token file to track the position of the XML content in the CUE file. |
| 104 | + dec.tokenFile = token.NewFile(dec.fileName, 0, len(xmlText)) |
| 105 | + dec.tokenFile.SetLinesForContent(xmlText) |
| 106 | + |
| 107 | + for { |
| 108 | + startOffset := xmlDec.InputOffset() |
| 109 | + t, err := xmlDec.Token() |
| 110 | + if err == io.EOF { |
| 111 | + break |
| 112 | + } |
| 113 | + if err != nil { |
| 114 | + return nil, err |
| 115 | + } |
| 116 | + switch xmlToken := t.(type) { |
| 117 | + case xml.StartElement: |
| 118 | + err = dec.decodeStartElement(xmlToken, startOffset) |
| 119 | + case xml.CharData: |
| 120 | + err = dec.decoderInnerText(xmlToken, startOffset) |
| 121 | + case xml.EndElement: |
| 122 | + err = dec.decodeEndElement() |
| 123 | + } |
| 124 | + if err != nil { |
| 125 | + return nil, err |
| 126 | + } |
| 127 | + // If the XML document has ended, break out of the loop. |
| 128 | + if dec.astRoot != nil && dec.currXmlElement == nil { |
| 129 | + break |
| 130 | + } |
| 131 | + } |
| 132 | + return dec.astRoot, nil |
| 133 | +} |
| 134 | + |
| 135 | +func (dec *Decoder) decoderInnerText(xmlToken xml.CharData, contentOffset int64) error { |
| 136 | + // If this is text content within an XML element. |
| 137 | + textContent := string(xml.CharData(xmlToken)) |
| 138 | + if dec.currField.field == nil { |
| 139 | + if isWhiteSpace(textContent) { |
| 140 | + return nil |
| 141 | + } |
| 142 | + return fmt.Errorf("text content outside of an XML element is not supported") |
| 143 | + } |
| 144 | + pos := dec.tokenFile.Pos(int(contentOffset), token.NoRelPos) |
| 145 | + txtContentPosition := pos |
| 146 | + txtLabel := ast.NewString(contentAttribute) |
| 147 | + txtLabel.ValuePos = txtContentPosition |
| 148 | + val := toBasicLit(textContent) |
| 149 | + val.ValuePos = txtContentPosition |
| 150 | + textContentNode := &ast.Field{ |
| 151 | + Label: txtLabel, |
| 152 | + Value: val, |
| 153 | + TokenPos: pos, |
| 154 | + } |
| 155 | + dec.currInnerText = textContentNode |
| 156 | + dec.currXmlElement.textContentIsWhiteSpace = isWhiteSpace(textContent) |
| 157 | + return nil |
| 158 | +} |
| 159 | + |
| 160 | +func (dec *Decoder) decodeEndElement() error { |
| 161 | + // If there is text content within the element, add it to the element's value. |
| 162 | + if dec.currXmlElement != nil && dec.currInnerText != nil { |
| 163 | + // Only support text content within an element that has no sub-elements. |
| 164 | + if len(dec.currXmlElement.children) == 0 { |
| 165 | + dec.appendToCurrFieldStruct(dec.currInnerText) |
| 166 | + dec.currInnerText = nil |
| 167 | + } else if len(dec.currXmlElement.children) > 0 && !dec.currXmlElement.textContentIsWhiteSpace { |
| 168 | + // If there is text content within an element that has sub-elements, return an error. |
| 169 | + return mixedContentError() |
| 170 | + } |
| 171 | + } |
| 172 | + // For the xmlElement hierarchy: step back up the XML hierarchy. |
| 173 | + if dec.currXmlElement != nil { |
| 174 | + dec.currXmlElement = dec.currXmlElement.parent |
| 175 | + } |
| 176 | + // For the CUE ast: end current element, and step back up the XML hierarchy. |
| 177 | + if len(dec.ancestors) > 0 { |
| 178 | + dec.currField = dec.ancestors[len(dec.ancestors)-1] |
| 179 | + dec.ancestors = dec.ancestors[:len(dec.ancestors)-1] |
| 180 | + } |
| 181 | + return nil |
| 182 | +} |
| 183 | + |
| 184 | +func (dec *Decoder) decodeStartElement(xmlToken xml.StartElement, startOffset int64) error { |
| 185 | + // Covers the root node. |
| 186 | + if dec.currField.field == nil { |
| 187 | + dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr} |
| 188 | + cueElement, err := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset) |
| 189 | + if err != nil { |
| 190 | + return err |
| 191 | + } |
| 192 | + dec.currField.assignNewCurrField(cueElement) |
| 193 | + dec.astRoot = ast.NewStruct(dec.currField.field) |
| 194 | + ast.SetPos(dec.astRoot, dec.tokenFile.Pos(0, token.NoRelPos)) |
| 195 | + return nil |
| 196 | + } |
| 197 | + // If this is not the root node, check if there is text content within the element. |
| 198 | + if dec.currInnerText != nil && !dec.currXmlElement.textContentIsWhiteSpace { |
| 199 | + return mixedContentError() |
| 200 | + } |
| 201 | + // Clear any whitespace text content. |
| 202 | + dec.currInnerText = nil |
| 203 | + // For xmlElement hierarchy: step down the XML hierarchy. |
| 204 | + parentXmlNode := dec.currXmlElement |
| 205 | + dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr, parent: parentXmlNode} |
| 206 | + parentXmlNode.children = append(parentXmlNode.children, dec.currXmlElement) |
| 207 | + // For the CUE ast: step down the CUE hierarchy. |
| 208 | + dec.ancestors = append(dec.ancestors, dec.currField) |
| 209 | + newElement, err := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset) |
| 210 | + if err != nil { |
| 211 | + return err |
| 212 | + } |
| 213 | + // Check if this new XML element has a name that's been seen before at the current level. |
| 214 | + prefixedXmlElementName := prefixedElementName(xmlToken, dec.currXmlElement) |
| 215 | + sameNameElements := dec.currField.currFieldChildren[prefixedXmlElementName] |
| 216 | + if sameNameElements != nil { |
| 217 | + list, ok := sameNameElements.Value.(*ast.ListLit) |
| 218 | + // If the field's value is not a ListLit, create a new ListLit and append the existing field. |
| 219 | + if !ok { |
| 220 | + list = &ast.ListLit{Elts: []ast.Expr{sameNameElements.Value}} |
| 221 | + sameNameElements.Value = list |
| 222 | + } |
| 223 | + // Append the new element to the ListLit, which we now know exists. |
| 224 | + list.Elts = append(list.Elts, newElement.Value) |
| 225 | + dec.currField.assignNewCurrField(newElement) |
| 226 | + return nil |
| 227 | + } |
| 228 | + dec.currField.currFieldChildren[prefixedXmlElementName] = newElement |
| 229 | + dec.appendToCurrFieldStruct(newElement) |
| 230 | + dec.currField.assignNewCurrField(newElement) |
| 231 | + return nil |
| 232 | +} |
| 233 | + |
| 234 | +func (dec *Decoder) appendToCurrFieldStruct(field *ast.Field) { |
| 235 | + dec.currField.field.Value.(*ast.StructLit).Elts = append(dec.currField.field.Value.(*ast.StructLit).Elts, field) |
| 236 | +} |
| 237 | + |
// mixedContentError returns the error reported when an XML element contains
// both text content and sub-elements.
func mixedContentError() error {
	const msg = "text content within an XML element that has sub-elements is not supported"
	return fmt.Errorf(msg)
}
| 241 | + |
// isWhiteSpace reports whether every rune in s is white space as defined by
// [unicode.IsSpace]. The empty string counts as white space.
func isWhiteSpace(s string) bool {
	notSpace := func(r rune) bool { return !unicode.IsSpace(r) }
	return strings.IndexFunc(s, notSpace) < 0
}
| 250 | + |
| 251 | +// cueFieldFromXmlElement creates a new [ast.Field] to model the given xml element information |
| 252 | +// in [xml.StartElement] and [xmlElement]. The startOffset represents the offset |
| 253 | +// for the beginning of the start tag of the given XML element. |
| 254 | +func (dec *Decoder) cueFieldFromXmlElement(elem xml.StartElement, xmlNode *xmlElement, startOffset int64) (*ast.Field, error) { |
| 255 | + elementName := prefixedElementName(elem, xmlNode) |
| 256 | + resLabel := ast.NewString(elementName) |
| 257 | + pos := dec.tokenFile.Pos(int(startOffset), token.NoRelPos) |
| 258 | + resLabel.ValuePos = pos |
| 259 | + resultValue := &ast.StructLit{} |
| 260 | + result := &ast.Field{ |
| 261 | + Label: resLabel, |
| 262 | + Value: resultValue, |
| 263 | + TokenPos: pos, |
| 264 | + } |
| 265 | + // Extract attributes as children. |
| 266 | + for _, a := range elem.Attr { |
| 267 | + attrName := prefixedAttrName(a, elem, xmlNode) |
| 268 | + label := ast.NewString(attributeSymbol + attrName) |
| 269 | + value := toBasicLit(a.Value) |
| 270 | + label.ValuePos = pos |
| 271 | + value.ValuePos = pos |
| 272 | + attrExpr := &ast.Field{ |
| 273 | + Label: label, |
| 274 | + Value: value, |
| 275 | + TokenPos: pos, |
| 276 | + } |
| 277 | + resultValue.Elts = append(resultValue.Elts, attrExpr) |
| 278 | + } |
| 279 | + return result, nil |
| 280 | +} |
| 281 | + |
| 282 | +// prefixedElementName returns the full name of an element, |
| 283 | +// including its namespace prefix if it has one; but without namespace prefix if it is "xmlns". |
| 284 | +func prefixedElementName(elem xml.StartElement, xmlNode *xmlElement) string { |
| 285 | + elementName := elem.Name.Local |
| 286 | + if elem.Name.Space != "" { |
| 287 | + prefixNS := nsPrefix(elem.Name.Space, elem.Attr, xmlNode) |
| 288 | + if prefixNS != "xmlns" { |
| 289 | + elementName = prefixNS + ":" + elem.Name.Local |
| 290 | + } |
| 291 | + } |
| 292 | + return elementName |
| 293 | +} |
| 294 | + |
| 295 | +// prefixedAttrName returns the full name of an attribute, including its namespace prefix if it has one. |
| 296 | +func prefixedAttrName(a xml.Attr, elem xml.StartElement, xmlNode *xmlElement) string { |
| 297 | + attrName := a.Name.Local |
| 298 | + if a.Name.Space != "" { |
| 299 | + prefix := nsPrefix(a.Name.Space, elem.Attr, xmlNode) |
| 300 | + attrName = prefix + ":" + a.Name.Local |
| 301 | + } |
| 302 | + return attrName |
| 303 | +} |
| 304 | + |
| 305 | +func toBasicLit(s string) *ast.BasicLit { |
| 306 | + s = strings.ReplaceAll(s, "\r", "") |
| 307 | + return ast.NewString(s) |
| 308 | +} |
| 309 | + |
| 310 | +// nsPrefix finds the prefix label for a given namespace by looking at the current node's |
| 311 | +// attributes and then walking up the hierarchy of XML nodes. |
| 312 | +func nsPrefix(nameSpace string, attributes []xml.Attr, xmlNode *xmlElement) string { |
| 313 | + // When the prefix is xmlns, then the namespace is xmlns according to the golang XML parser. |
| 314 | + if nameSpace == "xmlns" { |
| 315 | + return "xmlns" |
| 316 | + } |
| 317 | + for _, attr := range attributes { |
| 318 | + if attr.Value == nameSpace { |
| 319 | + return attr.Name.Local |
| 320 | + } |
| 321 | + } |
| 322 | + if xmlNode.parent != nil { |
| 323 | + return nsPrefix(nameSpace, xmlNode.parent.attr, xmlNode.parent) |
| 324 | + } |
| 325 | + panic("could not find prefix for namespace " + nameSpace) |
| 326 | +} |
| 327 | + |
| 328 | +func (cf *currFieldInfo) assignNewCurrField(field *ast.Field) { |
| 329 | + cf.field = field |
| 330 | + cf.currFieldChildren = make(map[string]*ast.Field) |
| 331 | +} |
0 commit comments