Skip to content

Commit ebf42e5

Browse files
matts85mvdan
authored andcommitted
encoding/xml/koala: introduce the first experimental XML encoder
This is the initial implementation of the Koala XML encoding as described at https://cuelang.org/discussion/3776. In short, this encoding is inspired by BadgerFish, an XML to JSON translation where XML element attributes and bodies become JSON object key-value pairs with "$" prefixes. We do something very similar in the XML "Koala" encoding, but adapting the logic to fit CUE better. Here we introduce the decoding side only; the encoding side will come as a separate change at a later time. See #3776. Signed-off-by: Matthew Sladescu <[email protected]> Change-Id: If2292c95289fc0229321b7737249dd6057556b03 Reviewed-on: https://review.gerrithub.io/c/cue-lang/cue/+/1211362 Reviewed-by: Daniel Martí <[email protected]> TryBot-Result: CUEcueckoo <[email protected]> Reviewed-by: Chief Cueckoo <[email protected]> Unity-Result: CUE porcuepine <[email protected]>
1 parent c565c1d commit ebf42e5

File tree

2 files changed

+881
-0
lines changed

2 files changed

+881
-0
lines changed

Diff for: encoding/xml/koala/decode.go

+331
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,331 @@
1+
// Copyright 2025 The CUE Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Package koala converts XML to and from CUE, as proposed in https://cuelang.org/discussion/3776.
16+
//
17+
// This encoding is inspired by the [BadgerFish] convention for translating XML to JSON.
18+
// XML elements are modeled as CUE structs, their attributes are modeled as struct fields
19+
// prefixed with "$", and their inner text content is modeled as a field named "$$".
20+
//
21+
// WARNING: THIS PACKAGE IS EXPERIMENTAL.
22+
// ITS API MAY CHANGE AT ANY TIME.
23+
//
24+
// [BadgerFish]: http://www.sklar.com/badgerfish/
25+
package koala
26+
27+
import (
28+
"bytes"
29+
"encoding/xml"
30+
"fmt"
31+
"io"
32+
"strings"
33+
"unicode"
34+
35+
"cuelang.org/go/cue/ast"
36+
"cuelang.org/go/cue/token"
37+
)
38+
39+
// Decoder implements the decoding state.
40+
type Decoder struct {
41+
reader io.Reader
42+
fileName string
43+
tokenFile *token.File
44+
45+
decoderRan bool
46+
47+
// current XML element being processed.
48+
currXmlElement *xmlElement
49+
50+
// The top-level CUE struct.
51+
astRoot *ast.StructLit
52+
// CUE model of ancestors of current XML element being processed.
53+
ancestors []currFieldInfo
54+
// CUE model of current XML element being processed.
55+
currField currFieldInfo
56+
// CUE model of current XML element's inner content ($$ attribute).
57+
currInnerText *ast.Field
58+
}
59+
60+
// currFieldInfo encapsulates details of the CUE field for the current XML element being processed.
61+
type currFieldInfo struct {
62+
// CUE model of current XML element.
63+
field *ast.Field
64+
// Running map of the current field's children.
65+
currFieldChildren map[string]*ast.Field
66+
}
67+
68+
// xmlElement is one node in a tree mirroring the XML elements seen so far.
// The tree keeps each element's attributes reachable from its descendants,
// which is how namespace prefixes are resolved.
type xmlElement struct {
	// xmlName is the element's name as reported by the XML parser.
	xmlName xml.Name
	// attr holds the attributes of the element's start tag.
	attr []xml.Attr
	// parent is the enclosing element, or nil for the root element.
	parent *xmlElement
	// children lists the child elements in document order.
	children []*xmlElement
	// textContentIsWhiteSpace reports whether the most recent character data
	// seen directly inside this element consisted only of white space.
	textContentIsWhiteSpace bool
}
77+
78+
const (
	// contentAttribute is the name of the CUE field that holds the inner
	// text content of an XML element.
	contentAttribute string = "$$"

	// attributeSymbol is prepended to an XML attribute's name to form the
	// name of the CUE field that models it.
	attributeSymbol string = "$"
)
83+
84+
// NewDecoder creates a decoder from a stream of XML input.
85+
func NewDecoder(fileName string, r io.Reader) *Decoder {
86+
return &Decoder{reader: r, fileName: fileName}
87+
}
88+
89+
// Decode parses the input stream as XML and converts it to a CUE [ast.Expr].
90+
// The input stream is taken from the [Decoder] and consumed.
91+
func (dec *Decoder) Decode() (ast.Expr, error) {
92+
if dec.decoderRan {
93+
return nil, io.EOF
94+
}
95+
dec.decoderRan = true
96+
xmlText, err := io.ReadAll(dec.reader)
97+
if err != nil {
98+
return nil, err
99+
}
100+
reader := bytes.NewReader(xmlText)
101+
xmlDec := xml.NewDecoder(reader)
102+
103+
// Create a token file to track the position of the XML content in the CUE file.
104+
dec.tokenFile = token.NewFile(dec.fileName, 0, len(xmlText))
105+
dec.tokenFile.SetLinesForContent(xmlText)
106+
107+
for {
108+
startOffset := xmlDec.InputOffset()
109+
t, err := xmlDec.Token()
110+
if err == io.EOF {
111+
break
112+
}
113+
if err != nil {
114+
return nil, err
115+
}
116+
switch xmlToken := t.(type) {
117+
case xml.StartElement:
118+
err = dec.decodeStartElement(xmlToken, startOffset)
119+
case xml.CharData:
120+
err = dec.decoderInnerText(xmlToken, startOffset)
121+
case xml.EndElement:
122+
err = dec.decodeEndElement()
123+
}
124+
if err != nil {
125+
return nil, err
126+
}
127+
// If the XML document has ended, break out of the loop.
128+
if dec.astRoot != nil && dec.currXmlElement == nil {
129+
break
130+
}
131+
}
132+
return dec.astRoot, nil
133+
}
134+
135+
func (dec *Decoder) decoderInnerText(xmlToken xml.CharData, contentOffset int64) error {
136+
// If this is text content within an XML element.
137+
textContent := string(xml.CharData(xmlToken))
138+
if dec.currField.field == nil {
139+
if isWhiteSpace(textContent) {
140+
return nil
141+
}
142+
return fmt.Errorf("text content outside of an XML element is not supported")
143+
}
144+
pos := dec.tokenFile.Pos(int(contentOffset), token.NoRelPos)
145+
txtContentPosition := pos
146+
txtLabel := ast.NewString(contentAttribute)
147+
txtLabel.ValuePos = txtContentPosition
148+
val := toBasicLit(textContent)
149+
val.ValuePos = txtContentPosition
150+
textContentNode := &ast.Field{
151+
Label: txtLabel,
152+
Value: val,
153+
TokenPos: pos,
154+
}
155+
dec.currInnerText = textContentNode
156+
dec.currXmlElement.textContentIsWhiteSpace = isWhiteSpace(textContent)
157+
return nil
158+
}
159+
160+
func (dec *Decoder) decodeEndElement() error {
161+
// If there is text content within the element, add it to the element's value.
162+
if dec.currXmlElement != nil && dec.currInnerText != nil {
163+
// Only support text content within an element that has no sub-elements.
164+
if len(dec.currXmlElement.children) == 0 {
165+
dec.appendToCurrFieldStruct(dec.currInnerText)
166+
dec.currInnerText = nil
167+
} else if len(dec.currXmlElement.children) > 0 && !dec.currXmlElement.textContentIsWhiteSpace {
168+
// If there is text content within an element that has sub-elements, return an error.
169+
return mixedContentError()
170+
}
171+
}
172+
// For the xmlElement hierarchy: step back up the XML hierarchy.
173+
if dec.currXmlElement != nil {
174+
dec.currXmlElement = dec.currXmlElement.parent
175+
}
176+
// For the CUE ast: end current element, and step back up the XML hierarchy.
177+
if len(dec.ancestors) > 0 {
178+
dec.currField = dec.ancestors[len(dec.ancestors)-1]
179+
dec.ancestors = dec.ancestors[:len(dec.ancestors)-1]
180+
}
181+
return nil
182+
}
183+
184+
func (dec *Decoder) decodeStartElement(xmlToken xml.StartElement, startOffset int64) error {
185+
// Covers the root node.
186+
if dec.currField.field == nil {
187+
dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr}
188+
cueElement, err := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset)
189+
if err != nil {
190+
return err
191+
}
192+
dec.currField.assignNewCurrField(cueElement)
193+
dec.astRoot = ast.NewStruct(dec.currField.field)
194+
ast.SetPos(dec.astRoot, dec.tokenFile.Pos(0, token.NoRelPos))
195+
return nil
196+
}
197+
// If this is not the root node, check if there is text content within the element.
198+
if dec.currInnerText != nil && !dec.currXmlElement.textContentIsWhiteSpace {
199+
return mixedContentError()
200+
}
201+
// Clear any whitespace text content.
202+
dec.currInnerText = nil
203+
// For xmlElement hierarchy: step down the XML hierarchy.
204+
parentXmlNode := dec.currXmlElement
205+
dec.currXmlElement = &xmlElement{xmlName: xmlToken.Name, attr: xmlToken.Attr, parent: parentXmlNode}
206+
parentXmlNode.children = append(parentXmlNode.children, dec.currXmlElement)
207+
// For the CUE ast: step down the CUE hierarchy.
208+
dec.ancestors = append(dec.ancestors, dec.currField)
209+
newElement, err := dec.cueFieldFromXmlElement(xmlToken, dec.currXmlElement, startOffset)
210+
if err != nil {
211+
return err
212+
}
213+
// Check if this new XML element has a name that's been seen before at the current level.
214+
prefixedXmlElementName := prefixedElementName(xmlToken, dec.currXmlElement)
215+
sameNameElements := dec.currField.currFieldChildren[prefixedXmlElementName]
216+
if sameNameElements != nil {
217+
list, ok := sameNameElements.Value.(*ast.ListLit)
218+
// If the field's value is not a ListLit, create a new ListLit and append the existing field.
219+
if !ok {
220+
list = &ast.ListLit{Elts: []ast.Expr{sameNameElements.Value}}
221+
sameNameElements.Value = list
222+
}
223+
// Append the new element to the ListLit, which we now know exists.
224+
list.Elts = append(list.Elts, newElement.Value)
225+
dec.currField.assignNewCurrField(newElement)
226+
return nil
227+
}
228+
dec.currField.currFieldChildren[prefixedXmlElementName] = newElement
229+
dec.appendToCurrFieldStruct(newElement)
230+
dec.currField.assignNewCurrField(newElement)
231+
return nil
232+
}
233+
234+
func (dec *Decoder) appendToCurrFieldStruct(field *ast.Field) {
235+
dec.currField.field.Value.(*ast.StructLit).Elts = append(dec.currField.field.Value.(*ast.StructLit).Elts, field)
236+
}
237+
238+
// mixedContentError returns the error reported when an XML element contains
// both non-white-space text and child elements.
func mixedContentError() error {
	const msg = "text content within an XML element that has sub-elements is not supported"
	return fmt.Errorf(msg)
}
241+
242+
// isWhiteSpace reports whether s consists solely of Unicode white space.
// The empty string counts as white space.
func isWhiteSpace(s string) bool {
	notSpace := func(r rune) bool { return !unicode.IsSpace(r) }
	return strings.IndexFunc(s, notSpace) < 0
}
250+
251+
// cueFieldFromXmlElement creates a new [ast.Field] to model the given xml element information
252+
// in [xml.StartElement] and [xmlElement]. The startOffset represents the offset
253+
// for the beginning of the start tag of the given XML element.
254+
func (dec *Decoder) cueFieldFromXmlElement(elem xml.StartElement, xmlNode *xmlElement, startOffset int64) (*ast.Field, error) {
255+
elementName := prefixedElementName(elem, xmlNode)
256+
resLabel := ast.NewString(elementName)
257+
pos := dec.tokenFile.Pos(int(startOffset), token.NoRelPos)
258+
resLabel.ValuePos = pos
259+
resultValue := &ast.StructLit{}
260+
result := &ast.Field{
261+
Label: resLabel,
262+
Value: resultValue,
263+
TokenPos: pos,
264+
}
265+
// Extract attributes as children.
266+
for _, a := range elem.Attr {
267+
attrName := prefixedAttrName(a, elem, xmlNode)
268+
label := ast.NewString(attributeSymbol + attrName)
269+
value := toBasicLit(a.Value)
270+
label.ValuePos = pos
271+
value.ValuePos = pos
272+
attrExpr := &ast.Field{
273+
Label: label,
274+
Value: value,
275+
TokenPos: pos,
276+
}
277+
resultValue.Elts = append(resultValue.Elts, attrExpr)
278+
}
279+
return result, nil
280+
}
281+
282+
// prefixedElementName returns the full name of an element,
283+
// including its namespace prefix if it has one; but without namespace prefix if it is "xmlns".
284+
func prefixedElementName(elem xml.StartElement, xmlNode *xmlElement) string {
285+
elementName := elem.Name.Local
286+
if elem.Name.Space != "" {
287+
prefixNS := nsPrefix(elem.Name.Space, elem.Attr, xmlNode)
288+
if prefixNS != "xmlns" {
289+
elementName = prefixNS + ":" + elem.Name.Local
290+
}
291+
}
292+
return elementName
293+
}
294+
295+
// prefixedAttrName returns the full name of an attribute, including its namespace prefix if it has one.
296+
func prefixedAttrName(a xml.Attr, elem xml.StartElement, xmlNode *xmlElement) string {
297+
attrName := a.Name.Local
298+
if a.Name.Space != "" {
299+
prefix := nsPrefix(a.Name.Space, elem.Attr, xmlNode)
300+
attrName = prefix + ":" + a.Name.Local
301+
}
302+
return attrName
303+
}
304+
305+
func toBasicLit(s string) *ast.BasicLit {
306+
s = strings.ReplaceAll(s, "\r", "")
307+
return ast.NewString(s)
308+
}
309+
310+
// nsPrefix finds the prefix label for a given namespace by looking at the current node's
311+
// attributes and then walking up the hierarchy of XML nodes.
312+
func nsPrefix(nameSpace string, attributes []xml.Attr, xmlNode *xmlElement) string {
313+
// When the prefix is xmlns, then the namespace is xmlns according to the golang XML parser.
314+
if nameSpace == "xmlns" {
315+
return "xmlns"
316+
}
317+
for _, attr := range attributes {
318+
if attr.Value == nameSpace {
319+
return attr.Name.Local
320+
}
321+
}
322+
if xmlNode.parent != nil {
323+
return nsPrefix(nameSpace, xmlNode.parent.attr, xmlNode.parent)
324+
}
325+
panic("could not find prefix for namespace " + nameSpace)
326+
}
327+
328+
func (cf *currFieldInfo) assignNewCurrField(field *ast.Field) {
329+
cf.field = field
330+
cf.currFieldChildren = make(map[string]*ast.Field)
331+
}

0 commit comments

Comments
 (0)