package hclwrite

import (
	"fmt"
	"sort"

	"github.com/hashicorp/hcl2/hcl"
	"github.com/hashicorp/hcl2/hcl/hclsyntax"
	"github.com/zclconf/go-cty/cty"
)

// Our "parser" here is actually not doing any parsing of its own. Instead,
// it leans on the native parser in hclsyntax, and then uses the source ranges
// from the AST to partition the raw token sequence to match the raw tokens
// up to AST nodes.
//
// This strategy feels somewhat counter-intuitive, since most of the work the
// parser does is thrown away here, but this strategy is chosen because the
// normal parsing work done by hclsyntax is considered to be the "main case",
// while modifying and re-printing source is more of an edge case, used only
// in ancillary tools, and so it's good to keep all the main parsing logic
// with the main case but keep all of the extra complexity of token wrangling
// out of the main parser, which is already rather complex just serving the
// use-cases it already serves.
//
// If the parsing step produces any errors, the returned File is nil because
// we can't reliably extract tokens from the partial AST produced by an
// erroneous parse.
func parse(src []byte, filename string, start hcl.Pos) (*File, hcl.Diagnostics) {
	file, diags := hclsyntax.ParseConfig(src, filename, start)
	if diags.HasErrors() {
		return nil, diags
	}

	// To do our work here, we use the "native" tokens (those from hclsyntax)
	// to match against source ranges in the AST, but ultimately produce
	// slices from our sequence of "writer" tokens, which contain only
	// *relative* position information that is more appropriate for
	// transformation/writing use-cases.
	nativeTokens, diags := hclsyntax.LexConfig(src, filename, start)
	if diags.HasErrors() {
		// should never happen, since we would've caught these diags in
		// the first call above.
		return nil, diags
	}
	writerTokens := writerTokens(nativeTokens)

	from := inputTokens{
		nativeTokens: nativeTokens,
		writerTokens: writerTokens,
	}

	before, root, after := parseBody(file.Body.(*hclsyntax.Body), from)
	ret := &File{
		inTree: newInTree(),

		srcBytes: src,
		body:     root,
	}

	// Any tokens that fell outside the root body (e.g. a leading byte-order
	// mark or trailing EOF token) are preserved as unstructured siblings of
	// the body node so that re-printing reproduces the full input.
	nodes := ret.inTree.children
	nodes.Append(before.Tokens())
	nodes.AppendNode(root)
	nodes.Append(after.Tokens())

	return ret, diags
}

// inputTokens pairs the native hclsyntax token sequence with the
// corresponding writer token sequence produced by writerTokens. The two
// slices always have the same length and use the same indices, so a
// partition computed against the native tokens (which carry absolute
// positions) can be applied to both.
type inputTokens struct {
	nativeTokens hclsyntax.Tokens
	writerTokens Tokens
}

// Partition splits the sequence into the tokens before, within, and after
// the given source range, using partitionTokens to locate the boundaries.
func (it inputTokens) Partition(rng hcl.Range) (before, within, after inputTokens) {
	start, end := partitionTokens(it.nativeTokens, rng)
	before = it.Slice(0, start)
	within = it.Slice(start, end)
	after = it.Slice(end, len(it.nativeTokens))
	return
}

// PartitionType splits the sequence around the first token of the given
// type. It panics if no such token exists, so it should only be used when
// the grammar guarantees the token's presence.
func (it inputTokens) PartitionType(ty hclsyntax.TokenType) (before, within, after inputTokens) {
	for i, t := range it.writerTokens {
		if t.Type == ty {
			return it.Slice(0, i), it.Slice(i, i+1), it.Slice(i+1, len(it.nativeTokens))
		}
	}
	panic(fmt.Sprintf("didn't find any token of type %s", ty))
}

// PartitionTypeSingle is like PartitionType but returns the matched token
// directly rather than as a one-element sequence.
func (it inputTokens) PartitionTypeSingle(ty hclsyntax.TokenType) (before inputTokens, found *Token, after inputTokens) {
	before, within, after := it.PartitionType(ty)
	if within.Len() != 1 {
		panic("PartitionType found more than one token")
	}
	return before, within.Tokens()[0], after
}

// PartitionIncludingComments is like Partition except the returned "within"
// range includes any lead and line comments associated with the range.
func (it inputTokens) PartitionIncludingComments(rng hcl.Range) (before, within, after inputTokens) {
	start, end := partitionTokens(it.nativeTokens, rng)
	// Widen the range backwards over any lead comments and forwards over
	// any line comments plus the terminating newline.
	start = partitionLeadCommentTokens(it.nativeTokens[:start])
	_, afterNewline := partitionLineEndTokens(it.nativeTokens[end:])
	end += afterNewline

	before = it.Slice(0, start)
	within = it.Slice(start, end)
	after = it.Slice(end, len(it.nativeTokens))
	return
}

// PartitionBlockItem is similar to PartitionIncludingComments but it returns
// the comments as separate token sequences so that they can be captured into
// AST attributes. It makes assumptions that apply only to block items, so
// should not be used for other constructs.
func (it inputTokens) PartitionBlockItem(rng hcl.Range) (before, leadComments, within, lineComments, newline, after inputTokens) {
	before, within, after = it.Partition(rng)
	before, leadComments = before.PartitionLeadComments()
	lineComments, newline, after = after.PartitionLineEndTokens()
	return
}

// PartitionLeadComments splits off any lead comment tokens at the end of the
// sequence, returning the preceding tokens and the comments separately.
func (it inputTokens) PartitionLeadComments() (before, within inputTokens) {
	start := partitionLeadCommentTokens(it.nativeTokens)
	before = it.Slice(0, start)
	within = it.Slice(start, len(it.nativeTokens))
	return
}

// PartitionLineEndTokens splits the sequence into any line comments, the
// newline tokens that end the line, and everything after.
func (it inputTokens) PartitionLineEndTokens() (comments, newline, after inputTokens) {
	afterComments, afterNewline := partitionLineEndTokens(it.nativeTokens)
	comments = it.Slice(0, afterComments)
	newline = it.Slice(afterComments, afterNewline)
	after = it.Slice(afterNewline, len(it.nativeTokens))
	return
}

// Slice returns the sub-sequence [start, end) of both token slices.
func (it inputTokens) Slice(start, end int) inputTokens {
	// When we slice, we create a new slice with no additional capacity because
	// we expect that these slices will be mutated in order to insert
	// new code into the AST, and we want to ensure that a new underlying
	// array gets allocated in that case, rather than writing into some
	// following slice and corrupting it.
	return inputTokens{
		nativeTokens: it.nativeTokens[start:end:end],
		writerTokens: it.writerTokens[start:end:end],
	}
}

// Len returns the number of tokens in the sequence.
func (it inputTokens) Len() int {
	return len(it.nativeTokens)
}

// Tokens returns the writer-token view of the sequence.
func (it inputTokens) Tokens() Tokens {
	return it.writerTokens
}

// Types returns the token types of the sequence, in order.
func (it inputTokens) Types() []hclsyntax.TokenType {
	ret := make([]hclsyntax.TokenType, len(it.nativeTokens))
	for i, tok := range it.nativeTokens {
		ret[i] = tok.Type
	}
	return ret
}

// parseBody locates the given body within the given input tokens and returns
// the resulting *Body object as well as the tokens that appeared before and
// after it.
func parseBody(nativeBody *hclsyntax.Body, from inputTokens) (inputTokens, *node, inputTokens) {
	before, within, after := from.PartitionIncludingComments(nativeBody.SrcRange)

	// The main AST doesn't retain the original source ordering of the
	// body items, so we need to reconstruct that ordering by inspecting
	// their source ranges.
	nativeItems := make([]hclsyntax.Node, 0, len(nativeBody.Attributes)+len(nativeBody.Blocks))
	for _, nativeAttr := range nativeBody.Attributes {
		nativeItems = append(nativeItems, nativeAttr)
	}
	for _, nativeBlock := range nativeBody.Blocks {
		nativeItems = append(nativeItems, nativeBlock)
	}
	sort.Sort(nativeNodeSorter{nativeItems})

	body := &Body{
		inTree: newInTree(),
		items:  newNodeSet(),
	}

	// Walk the items in source order, attaching any tokens between items
	// (blank lines, standalone comments) as unstructured tokens.
	remain := within
	for _, nativeItem := range nativeItems {
		beforeItem, item, afterItem := parseBodyItem(nativeItem, remain)

		if beforeItem.Len() > 0 {
			body.AppendUnstructuredTokens(beforeItem.Tokens())
		}
		body.appendItemNode(item)

		remain = afterItem
	}

	if remain.Len() > 0 {
		body.AppendUnstructuredTokens(remain.Tokens())
	}

	return before, newNode(body), after
}

// parseBodyItem parses a single body item (attribute or block), splitting
// off its lead comments, line comments, and trailing newline first and
// passing them on to the type-specific parse function.
func parseBodyItem(nativeItem hclsyntax.Node, from inputTokens) (inputTokens, *node, inputTokens) {
	before, leadComments, within, lineComments, newline, after := from.PartitionBlockItem(nativeItem.Range())

	var item *node

	switch tItem := nativeItem.(type) {
	case *hclsyntax.Attribute:
		item = parseAttribute(tItem, within, leadComments, lineComments, newline)
	case *hclsyntax.Block:
		item = parseBlock(tItem, within, leadComments, lineComments, newline)
	default:
		// should never happen if caller is behaving
		panic("unsupported native item type")
	}

	return before, item, after
}

// parseAttribute builds an *Attribute node from the given tokens, attaching
// the lead comments, name, equals sign, expression, line comments, and
// trailing newline as child nodes in source order.
func parseAttribute(nativeAttr *hclsyntax.Attribute, from, leadComments, lineComments, newline inputTokens) *node {
	attr := &Attribute{
		inTree: newInTree(),
	}
	children := attr.inTree.children

	{
		cn := newNode(newComments(leadComments.Tokens()))
		attr.leadComments = cn
		children.AppendNode(cn)
	}

	before, nameTokens, from := from.Partition(nativeAttr.NameRange)
	{
		children.AppendUnstructuredTokens(before.Tokens())
		if nameTokens.Len() != 1 {
			// Should never happen with valid input
			panic("attribute name is not exactly one token")
		}
		token := nameTokens.Tokens()[0]
		in := newNode(newIdentifier(token))
		attr.name = in
		children.AppendNode(in)
	}

	before, equalsTokens, from := from.Partition(nativeAttr.EqualsRange)
	children.AppendUnstructuredTokens(before.Tokens())
	children.AppendUnstructuredTokens(equalsTokens.Tokens())

	before, exprTokens, from := from.Partition(nativeAttr.Expr.Range())
	{
		children.AppendUnstructuredTokens(before.Tokens())
		exprNode := parseExpression(nativeAttr.Expr, exprTokens)
		attr.expr = exprNode
		children.AppendNode(exprNode)
	}

	{
		cn := newNode(newComments(lineComments.Tokens()))
		attr.lineComments = cn
		children.AppendNode(cn)
	}

	children.AppendUnstructuredTokens(newline.Tokens())

	// Collect any stragglers, though there shouldn't be any
	children.AppendUnstructuredTokens(from.Tokens())

	return newNode(attr)
}

// parseBlock builds a *Block node from the given tokens, attaching the lead
// comments, type name, labels, braces, nested body, and trailing newline as
// child nodes in source order.
func parseBlock(nativeBlock *hclsyntax.Block, from, leadComments, lineComments, newline inputTokens) *node {
	block := &Block{
		inTree: newInTree(),
		labels: newNodeSet(),
	}
	children := block.inTree.children

	{
		cn := newNode(newComments(leadComments.Tokens()))
		block.leadComments = cn
		children.AppendNode(cn)
	}

	before, typeTokens, from := from.Partition(nativeBlock.TypeRange)
	{
		children.AppendUnstructuredTokens(before.Tokens())
		if typeTokens.Len() != 1 {
			// Should never happen with valid input
			panic("block type name is not exactly one token")
		}
		token := typeTokens.Tokens()[0]
		in := newNode(newIdentifier(token))
		block.typeName = in
		children.AppendNode(in)
	}

	for _, rng := range nativeBlock.LabelRanges {
		var labelTokens inputTokens
		before, labelTokens, from = from.Partition(rng)
		children.AppendUnstructuredTokens(before.Tokens())
		tokens := labelTokens.Tokens()
		ln := newNode(newQuoted(tokens))
		block.labels.Add(ln)
		children.AppendNode(ln)
	}

	before, oBrace, from := from.Partition(nativeBlock.OpenBraceRange)
	children.AppendUnstructuredTokens(before.Tokens())
	children.AppendUnstructuredTokens(oBrace.Tokens())

	// We go a bit out of order here: we go hunting for the closing brace
	// so that we have a delimited body, but then we'll deal with the body
	// before we actually append the closing brace and any straggling tokens
	// that appear after it.
	bodyTokens, cBrace, from := from.Partition(nativeBlock.CloseBraceRange)
	before, body, after := parseBody(nativeBlock.Body, bodyTokens)
	children.AppendUnstructuredTokens(before.Tokens())
	block.body = body
	children.AppendNode(body)
	children.AppendUnstructuredTokens(after.Tokens())

	children.AppendUnstructuredTokens(cBrace.Tokens())

	// stragglers
	children.AppendUnstructuredTokens(from.Tokens())
	if lineComments.Len() > 0 {
		// blocks don't actually have line comments, so we'll just treat
		// them as extra stragglers
		children.AppendUnstructuredTokens(lineComments.Tokens())
	}
	children.AppendUnstructuredTokens(newline.Tokens())

	return newNode(block)
}

// parseExpression builds an expression node from the given tokens, locating
// any absolute traversals (variable references) within it and attaching
// them as structured child nodes.
func parseExpression(nativeExpr hclsyntax.Expression, from inputTokens) *node {
	expr := newExpression()
	children := expr.inTree.children

	nativeVars := nativeExpr.Variables()

	for _, nativeTraversal := range nativeVars {
		before, traversal, after := parseTraversal(nativeTraversal, from)
		children.AppendUnstructuredTokens(before.Tokens())
		children.AppendNode(traversal)
		expr.absTraversals.Add(traversal)
		from = after
	}
	// Attach any stragglers that don't belong to a traversal to the expression
	// itself. In an expression with no traversals at all, this is just the
	// entirety of "from".
	children.AppendUnstructuredTokens(from.Tokens())

	return newNode(expr)
}

// parseTraversal builds a traversal node from the given tokens, attaching
// one child node per traversal step.
func parseTraversal(nativeTraversal hcl.Traversal, from inputTokens) (before inputTokens, n *node, after inputTokens) {
	traversal := newTraversal()
	children := traversal.inTree.children
	before, from, after = from.Partition(nativeTraversal.SourceRange())

	stepAfter := from
	for _, nativeStep := range nativeTraversal {
		before, step, after := parseTraversalStep(nativeStep, stepAfter)
		children.AppendUnstructuredTokens(before.Tokens())
		children.AppendNode(step)
		traversal.steps.Add(step)
		stepAfter = after
	}

	return before, newNode(traversal), after
}

// parseTraversalStep builds a node for a single traversal step: a name for
// root/attribute steps, or an index (bracketed string or number key) for
// index steps.
func parseTraversalStep(nativeStep hcl.Traverser, from inputTokens) (before inputTokens, n *node, after inputTokens) {
	var children *nodes
	switch tNativeStep := nativeStep.(type) {

	case hcl.TraverseRoot, hcl.TraverseAttr:
		step := newTraverseName()
		children = step.inTree.children
		before, from, after = from.Partition(nativeStep.SourceRange())
		inBefore, token, inAfter := from.PartitionTypeSingle(hclsyntax.TokenIdent)
		name := newIdentifier(token)
		children.AppendUnstructuredTokens(inBefore.Tokens())
		step.name = children.Append(name)
		children.AppendUnstructuredTokens(inAfter.Tokens())
		return before, newNode(step), after

	case hcl.TraverseIndex:
		step := newTraverseIndex()
		children = step.inTree.children
		before, from, after = from.Partition(nativeStep.SourceRange())

		var inBefore, oBrack, keyTokens, cBrack inputTokens
		inBefore, oBrack, from = from.PartitionType(hclsyntax.TokenOBrack)
		children.AppendUnstructuredTokens(inBefore.Tokens())
		children.AppendUnstructuredTokens(oBrack.Tokens())
		keyTokens, cBrack, from = from.PartitionType(hclsyntax.TokenCBrack)

		// The key node's representation depends on the key value's type:
		// strings are quoted sequences, numbers are single literal tokens.
		keyVal := tNativeStep.Key
		switch keyVal.Type() {
		case cty.String:
			key := newQuoted(keyTokens.Tokens())
			step.key = children.Append(key)
		case cty.Number:
			valBefore, valToken, valAfter := keyTokens.PartitionTypeSingle(hclsyntax.TokenNumberLit)
			children.AppendUnstructuredTokens(valBefore.Tokens())
			key := newNumber(valToken)
			step.key = children.Append(key)
			children.AppendUnstructuredTokens(valAfter.Tokens())
		}

		children.AppendUnstructuredTokens(cBrack.Tokens())
		children.AppendUnstructuredTokens(from.Tokens())

		return before, newNode(step), after

	default:
		panic(fmt.Sprintf("unsupported traversal step type %T", nativeStep))
	}
}

// writerTokens takes a sequence of tokens as produced by the main hclsyntax
// package and transforms it into an equivalent sequence of tokens using
// this package's own token model.
//
// The resulting list contains the same number of tokens and uses the same
// indices as the input, allowing the two sets of tokens to be correlated
// by index.
func writerTokens(nativeTokens hclsyntax.Tokens) Tokens {
	// Ultimately we want a slice of token _pointers_, but since we can
	// predict how much memory we're going to devote to tokens we'll allocate
	// it all as a single flat buffer and thus give the GC less work to do.
	tokBuf := make([]Token, len(nativeTokens))
	var lastByteOffset int
	for i, mainToken := range nativeTokens {
		// Create a copy of the bytes so that we can mutate without
		// corrupting the original token stream.
		bytes := make([]byte, len(mainToken.Bytes))
		copy(bytes, mainToken.Bytes)

		tokBuf[i] = Token{
			Type:  mainToken.Type,
			Bytes: bytes,

			// We assume here that spaces are always ASCII spaces, since
			// that's what the scanner also assumes, and thus the number
			// of bytes skipped is also the number of space characters.
			SpacesBefore: mainToken.Range.Start.Byte - lastByteOffset,
		}

		lastByteOffset = mainToken.Range.End.Byte
	}

	// Now make a slice of pointers into the previous slice.
	ret := make(Tokens, len(tokBuf))
	for i := range ret {
		ret[i] = &tokBuf[i]
	}

	return ret
}

// partitionTokens takes a sequence of tokens and a hcl.Range and returns
// two indices within the token sequence that correspond with the range
// boundaries, such that the slice operator could be used to produce
// three token sequences for before, within, and after respectively:
//
//	start, end := partitionTokens(toks, rng)
//	before := toks[:start]
//	within := toks[start:end]
//	after := toks[end:]
//
// This works best when the range is aligned with token boundaries (e.g.
// because it was produced in terms of the scanner's result) but if that isn't
// true then it will make a best effort that may produce strange results at
// the boundaries.
//
// Native hclsyntax tokens are used here, because they contain the necessary
// absolute position information. However, since writerTokens produces a
// correlatable sequence of writer tokens, the resulting indices can be
// used also to index into its result, allowing the partitioning of writer
// tokens to be driven by the partitioning of native tokens.
//
// The tokens are assumed to be in source order and non-overlapping, which
// will be true if the token sequence from the scanner is used directly.
func partitionTokens(toks hclsyntax.Tokens, rng hcl.Range) (start, end int) {
	// We use a linear search here because we assume that in most cases our
	// target range is close to the beginning of the sequence, and the
	// sequences are generally small for most reasonable files anyway.
	for i := 0; ; i++ {
		if i >= len(toks) {
			// No tokens for the given range at all!
			return len(toks), len(toks)
		}

		if toks[i].Range.Start.Byte >= rng.Start.Byte {
			start = i
			break
		}
	}

	for i := start; ; i++ {
		if i >= len(toks) {
			// The range "hangs off" the end of the token sequence
			return start, len(toks)
		}

		if toks[i].Range.Start.Byte >= rng.End.Byte {
			end = i // end marker is exclusive
			break
		}
	}

	return start, end
}

// partitionLeadCommentTokens takes a sequence of tokens that is assumed
// to immediately precede a construct that can have lead comment tokens,
// and returns the index into that sequence where the lead comments begin.
//
// Lead comments are defined as whole lines containing only comment tokens
// with no blank lines between. If no such lines are found, the returned
// index will be len(toks).
func partitionLeadCommentTokens(toks hclsyntax.Tokens) int {
	// single-line comments (which is what we're interested in here)
	// consume their trailing newline, so we can just walk backwards
	// until we stop seeing comment tokens.
	for i := len(toks) - 1; i >= 0; i-- {
		if toks[i].Type != hclsyntax.TokenComment {
			return i + 1
		}
	}
	return 0
}

// partitionLineEndTokens takes a sequence of tokens that is assumed
// to immediately follow a construct that can have a line comment, and
// returns first the index where any line comments end and then second
// the index immediately after the trailing newline.
//
// Line comments are defined as comments that appear immediately after
// a construct on the same line where its significant tokens ended.
//
// Since single-line comment tokens (# and //) include the newline that
// terminates them, in the presence of these the two returned indices
// will be the same since the comment itself serves as the line end.
func partitionLineEndTokens(toks hclsyntax.Tokens) (afterComment, afterNewline int) {
	for i := 0; i < len(toks); i++ {
		tok := toks[i]
		if tok.Type != hclsyntax.TokenComment {
			switch tok.Type {
			case hclsyntax.TokenNewline:
				return i, i + 1
			case hclsyntax.TokenEOF:
				// Although this is valid, we mustn't include the EOF
				// itself as our "newline" or else strange things will
				// happen when we try to append new items.
				return i, i
			default:
				// If we have well-formed input here then nothing else should be
				// possible. This path should never happen, because we only try
				// to extract tokens from the sequence if the parser succeeded,
				// and it should catch this problem itself.
				panic("malformed line trailers: expected only comments and newlines")
			}
		}

		if len(tok.Bytes) > 0 && tok.Bytes[len(tok.Bytes)-1] == '\n' {
			// Newline at the end of a single-line comment serves both as
			// the end of comments *and* the end of the line.
			return i + 1, i + 1
		}
	}
	return len(toks), len(toks)
}

// lexConfig uses the hclsyntax scanner to get a token stream and then
// rewrites it into this package's token model.
//
// Any errors produced during scanning are ignored, so the results of this
// function should be used with care.
func lexConfig(src []byte) Tokens {
	mainTokens, _ := hclsyntax.LexConfig(src, "", hcl.Pos{Byte: 0, Line: 1, Column: 1})
	return writerTokens(mainTokens)
}