Handle headers in code comments #2285

Merged · 10 commits · Aug 8, 2024
Changes from 3 commits
84 changes: 80 additions & 4 deletions pkg/tfgen/docs.go
@@ -386,9 +386,85 @@ func groupLines(lines []string, sep string) [][]string {
return sections
}

// splitGroupLines splits and groups a string, s, by a given separator, sep.
func splitGroupLines(s, sep string) [][]string {
return groupLines(strings.Split(s, "\n"), sep)
func trimFrontMatter(text []byte) []byte {
delineater := []byte("---")
body, ok := bytes.CutPrefix(text, delineater)
if !ok {
return text
}
idx := bytes.Index(body, delineater)

// Unable to find closing, so just return.
if idx == -1 {
return text
}
return body[idx+3:]
}

func splitByMdHeaders(text string, level int) [][]string {
bytes := trimFrontMatter([]byte(text))
idx := 0
headers := []int{}

parseDoc(bytes).Walk(func(node *bf.Node, entering bool) bf.WalkStatus {
if !entering {
return bf.GoToNext
}

if node.Type != bf.Heading || node.HeadingData.Level != level || node.HeadingData.IsTitleblock {
return bf.GoToNext
}
var foundHeader bool
for ; idx < len(bytes); idx++ {
// Here we take advantage of the fact that the .Literal field on
// leaf nodes is a view into the same byte array that was passed
// into `parseDoc` to recover the index of .Literal[0] in the
// original array.
if &bytes[idx] == &node.FirstChild.Literal[0] {
// We have found in `bytes` the location of a header text,
// but we want the start of the line. We need to walk
// back.
for i := idx; i > 0; i-- {
if bytes[i] == '\n' {
headers = append(headers, i+1)
break
}
}
foundHeader = true
break
}
}
contract.Assertf(foundHeader, "Failed to find source location of a header")
return bf.GoToNext
})

// headers now contains the indexes into `bytes` that represent the start of each
// section.
//
// We now use that information to extract sections from `text`.

offset := len(text) - len(bytes)
contract.Assertf(offset >= 0, "The offset generated by chopping off the front-matter cannot be negative")

sections := make([][]string, 0, len(headers)+1)

if len(headers) == 0 {
return [][]string{strings.Split(text, "\n")}
}

// Account for the first section
sections = append(sections, strings.Split(text[:headers[0]+offset-1], "\n"))

// Now handle the middle sections
for from := 0; from+1 < len(headers); from++ {
sections = append(sections,
strings.Split(text[headers[from]+offset:headers[from+1]+offset-1], "\n"))
}

// Account for the end section
sections = append(sections, strings.Split(text[headers[len(headers)-1]+offset:], "\n"))

return sections
}

// parseTFMarkdown takes a TF website markdown doc and extracts a structured representation for use in
@@ -475,7 +475,7 @@ func (p *tfMarkdownParser) parse(tfMarkdown []byte) (entityDocs, error) {
markdown = strings.Replace(markdown, "<!-- schema generated by tfplugindocs -->", "", -1)

// Split the sections by H2 topics in the Markdown file.
sections := splitGroupLines(markdown, "## ")
sections := splitByMdHeaders(markdown, 2)

// we are explicitly overwriting the Terraform examples here
if p.info != nil && p.info.GetDocs() != nil && p.info.ReplaceExamplesSection() {
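For orientation, here is a minimal sketch (not part of this diff) of how the new splitByMdHeaders helper behaves, assuming it is exercised from inside pkg/tfgen where the unexported functions are visible; the exampleSplit name and the sample document are illustrative assumptions.

```go
package tfgen

import (
	"fmt"
	"strings"
)

// exampleSplit sketches the behavior this change is after: an "## " line that
// sits inside a fenced code block is no longer treated as a section boundary,
// because only real blackfriday heading nodes are used to split the document.
func exampleSplit() {
	fence := "```"
	doc := strings.Join([]string{
		"---",
		"page_title: example",
		"---",
		"Intro text.",
		"",
		"## Example Usage",
		"",
		fence + "terraform",
		"## Minimal",
		`resource "snowflake_database" "primary" {`,
		`  name = "database_name"`,
		"}",
		fence,
		"",
		"## Argument Reference",
		"",
	}, "\n")

	sections := splitByMdHeaders(doc, 2)
	// Expected: three sections -- the preamble (front matter included), the
	// "## Example Usage" section containing the whole code fence, and the
	// "## Argument Reference" section. The old splitGroupLines(doc, "## ")
	// would also have split at the "## Minimal" comment inside the fence.
	for i, lines := range sections {
		fmt.Printf("section %d has %d lines\n", i, len(lines))
	}
}
```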
155 changes: 84 additions & 71 deletions pkg/tfgen/docs_test.go
@@ -688,7 +688,7 @@ Provides a DigitalOcean CDN Endpoint resource for use with Spaces.
## Argument Reference`

var processedMarkdown string
groups := splitGroupLines(markdown, "## ")
groups := splitByMdHeaders(markdown, 2)
for _, lines := range groups {
fixExampleTitles(lines)
for _, line := range lines {
@@ -715,7 +715,7 @@ Misleading example title without any actual code fences. We should not modify th
## Argument Reference`

var processedMarkdown string
groups := splitGroupLines(markdown, "## ")
groups := splitByMdHeaders(markdown, 2)
for _, lines := range groups {
fixExampleTitles(lines)
for _, line := range lines {
@@ -753,38 +753,43 @@ Basic usage:`

func TestReformatExamples(t *testing.T) {
runTest := func(input string, expected [][]string) {
inputSections := splitGroupLines(input, "## ")
output := reformatExamples(inputSections)
inputSections := splitByMdHeaders(input, 2)
actual := reformatExamples(inputSections)

assert.ElementsMatch(t, expected, output)
assert.Equal(t, expected, actual)
}

// This is a simple use case. We expect no changes to the original doc:
simpleDoc := `description
t.Run("no-op", func(t *testing.T) {
input := `description

## Example Usage

example usage content`

simpleDocExpected := [][]string{
{
"description",
"",
},
{
"## Example Usage",
"",
"example usage content",
},
}
expected := [][]string{
{
"description",
"",
},
{
"## Example Usage",
"",
"example usage content",
},
}

runTest(simpleDoc, simpleDocExpected)
runTest(input, expected)
})

// This use case demonstrates 2 examples at the same H2 level: a canonical Example Usage and another example
// for a specific use case. We expect these to be transformed into a canonical H2 "Example Usage" with an H3 for
// the specific use case.
// This scenario is common in the pulumi-gcp provider:
gcpDoc := `description
// This use case demonstrates 2 examples at the same H2 level: a canonical Example
// Usage and another example for a specific use case. We expect these to be
// transformed into a canonical H2 "Example Usage" with an H3 for the specific use
// case.
//
// This scenario is common in the pulumi-gcp provider.
t.Run("multiple-examples-same-level", func(t *testing.T) {
input := `description

## Example Usage

@@ -794,28 +799,32 @@ example usage content

specific case content`

gcpDocExpected := [][]string{
{
"description",
"",
},
{
"## Example Usage",
"",
"example usage content",
"",
"### Specific Case",
"",
"specific case content",
},
}
expected := [][]string{
{
"description",
"",
},
{
"## Example Usage",
"",
"example usage content",
"",
"### Specific Case",
"",
"specific case content",
},
}

runTest(gcpDoc, gcpDocExpected)
runTest(input, expected)
})

// This use case demonstrates 2 no canonical Example Usage/basic case and 2 specific use cases. We expect the
// function to add a canonical Example Usage section with the 2 use cases as H3's beneath the canonical section.
// This scenario is common in the pulumi-gcp provider:
gcpDoc2 := `description
// This use case demonstrates no canonical Example Usage/basic case and 2
// specific use cases. We expect the function to add a canonical Example Usage
// section with the 2 use cases as H3's beneath the canonical section.
//
// This scenario is common in the pulumi-gcp provider.
t.Run("no-canonical-example-header", func(t *testing.T) {
input := `description

## Example Usage - 1

@@ -825,41 +834,44 @@ content 1

content 2`

gcpDoc2Expected := [][]string{
{
"description",
"",
},
{
"## Example Usage",
"### 1",
"",
"content 1",
"",
"### 2",
"",
"content 2",
},
}
expected := [][]string{
{
"description",
"",
},
{
"## Example Usage",
"### 1",
"",
"content 1",
"",
"### 2",
"",
"content 2",
},
}

runTest(gcpDoc2, gcpDoc2Expected)
runTest(input, expected)
})

misformattedDocNoPanic := `## jetstream_kv_entry Resource
t.Run("misformatted-docs-dont-panic", func(t *testing.T) {
input := `## jetstream_kv_entry Resource
content
### Example
content`

misformattedDocsExpected := [][]string{
nil,
{
"## jetstream_kv_entry Resource",
"content",
"### Example",
"content",
},
}
expected := [][]string{
nil,
{
"## jetstream_kv_entry Resource",
"content",
"### Example",
"content",
},
}

runTest(misformattedDocNoPanic, misformattedDocsExpected)
runTest(input, expected)
})
}

func TestFormatEntityName(t *testing.T) {
@@ -1596,6 +1608,7 @@ func TestParseTFMarkdown(t *testing.T) {
[]byte(`CUSTOM_REPLACES`),
[]byte(`checking custom replaces`)), nil
})),
test("codeblock-header"),
}

for _, tt := range tests {
6 changes: 3 additions & 3 deletions pkg/tfgen/parse_markdown.go
@@ -443,13 +443,13 @@ func parseTextSeq(firstNode *bf.Node, useStarsForStrongAndEmph bool) (string, er
return buffer.String(), err
}

func parseDoc(text string) *bf.Node {
func parseDoc(text []byte) *bf.Node {
mdProc := bf.New(bf.WithExtensions(bf.FencedCode))
return mdProc.Parse([]byte(text))
return mdProc.Parse(text)
}

func parseNode(text string) *bf.Node {
return parseDoc(text).FirstChild
return parseDoc([]byte(text)).FirstChild
}

// Used for debugging blackfriday parse trees by visualizing them.
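The signature change to parseDoc is what lets splitByMdHeaders map parsed heading nodes back to byte offsets in the caller's buffer. Below is a rough sketch of that aliasing property, assuming (as the new code does) that blackfriday leaf-node Literal slices are views into the buffer passed to Parse; headingOffset is a hypothetical helper placed alongside parseDoc in pkg/tfgen, with bf aliasing github.com/russross/blackfriday/v2 as in the existing imports.

```go
// headingOffset returns the byte index in src at which the text of the first
// H2 heading begins, or -1 if no H2 heading is found. It works only because
// parseDoc now receives src itself rather than a copy made by []byte(text).
func headingOffset(src []byte) int {
	offset := -1
	parseDoc(src).Walk(func(node *bf.Node, entering bool) bf.WalkStatus {
		if !entering || node.Type != bf.Heading || node.HeadingData.Level != 2 {
			return bf.GoToNext
		}
		if node.FirstChild == nil || len(node.FirstChild.Literal) == 0 {
			return bf.GoToNext
		}
		lit := node.FirstChild.Literal
		for i := range src {
			// Pointer equality, not value equality: we are locating the
			// heading text within the original backing array.
			if &src[i] == &lit[0] {
				offset = i
				break
			}
		}
		return bf.Terminate
	})
	return offset
}
```

For example, headingOffset([]byte("intro\n\n## Example Usage\n\nbody\n")) reports where "Example Usage" starts; splitByMdHeaders then walks back from that index to the beginning of the line.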
4 changes: 2 additions & 2 deletions pkg/tfgen/parse_markdown_test.go
@@ -74,7 +74,7 @@ func TestParseTopLevelSchema(t *testing.T) {

var schema *topLevelSchema

parseDoc(markdown).Walk(func(node *bf.Node, entering bool) bf.WalkStatus {
parseDoc([]byte(markdown)).Walk(func(node *bf.Node, entering bool) bf.WalkStatus {
if entering {
tls, err := parseTopLevelSchema(node, nil)
if err != nil {
@@ -113,7 +113,7 @@ func TestParseTopLevelSchema(t *testing.T) {
func TestParseNestedSchemaIntoDoc(t *testing.T) {
markdown := readTestFile(t, "mini.md")
out := &entityDocs{}
parseDoc(markdown).Walk(func(node *bf.Node, entering bool) bf.WalkStatus {
parseDoc([]byte(markdown)).Walk(func(node *bf.Node, entering bool) bf.WalkStatus {
if entering {
nested, err := parseNestedSchema(node, nil)
if err != nil {
41 changes: 41 additions & 0 deletions pkg/tfgen/test_data/codeblock-header/expected.json
@@ -0,0 +1,41 @@
{
"Description": "!\u003e **V1 release candidate** This resource was reworked and is a release candidate for the V1. We do not expect significant changes in it before the V1. We will welcome any feedback and adjust the resource if needed. Any errors reported will be resolved with a higher priority. We encourage checking this resource out before the V1 release. Please follow the migration guide to use it.\n\nRepresents a standard database. If replication configuration is specified, the database is promoted to serve as a primary database for replication.\n\n## Example Usage\n\n```terraform\n## Minimal\nresource \"snowflake_database\" \"primary\" {\n name = \"database_name\"\n}\n\n## Complete (with every optional set)\nresource \"snowflake_database\" \"primary\" {\n name = \"database_name\"\n is_transient = false\n comment = \"my standard database\"\n\n data_retention_time_in_days = 10\n data_retention_time_in_days_save = 10\n max_data_extension_time_in_days = 20\n external_volume = \"\u003cexternal_volume_name\u003e\"\n catalog = \"\u003ccatalog_name\u003e\"\n replace_invalid_characters = false\n default_ddl_collation = \"en_US\"\n storage_serialization_policy = \"COMPATIBLE\"\n log_level = \"INFO\"\n trace_level = \"ALWAYS\"\n suspend_task_after_num_failures = 10\n task_auto_retry_attempts = 10\n user_task_managed_initial_warehouse_size = \"LARGE\"\n user_task_timeout_ms = 3600000\n user_task_minimum_trigger_interval_in_seconds = 120\n quoted_identifiers_ignore_case = false\n enable_console_output = false\n\n replication {\n enable_to_account {\n account_identifier = \"\u003csecondary_account_organization_name\u003e.\u003csecondary_account_name\u003e\"\n with_failover = true\n }\n ignore_edition_check = true\n }\n}\n\n## Replication with for_each\nlocals {\n replication_configs = [\n {\n account_identifier = \"\u003csecondary_account_organization_name\u003e.\u003csecondary_account_name\u003e\"\n with_failover = true\n },\n {\n account_identifier = \"\u003csecondary_account_organization_name\u003e.\u003csecondary_account_name\u003e\"\n with_failover = true\n },\n ]\n}\n\nresource \"snowflake_database\" \"primary\" {\n name = \"database_name\"\n for_each = local.replication_configs\n\n replication {\n enable_to_account = each.value\n ignore_edition_check = true\n }\n}\n```",
"Arguments": {
"replication.enable_to_account": {
"description": "Entry to enable replication and optionally failover for a given account identifier."
},
"replication.enable_to_account.account_identifier": {
"description": "Specifies account identifier for which replication should be enabled. The account identifiers should be in the form of `\"\u003corganization_name\u003e\".\"\u003caccount_name\u003e\"`."
},
"replication.enable_to_account.with_failover": {
"description": "Specifies if failover should be enabled for the specified account identifier"
},
"replication.ignore_edition_check": {
"description": "Allows replicating data to accounts on lower editions in either of the following scenarios: 1. The primary database is in a Business Critical (or higher) account but one or more of the accounts approved for replication are on lower editions. Business Critical Edition is intended for Snowflake accounts with extremely sensitive data. 2. The primary database is in a Business Critical (or higher) account and a signed business associate agreement is in place to store PHI data in the account per HIPAA and HITRUST regulations, but no such agreement is in place for one or more of the accounts approved for replication, regardless if they are Business Critical (or higher) accounts. Both scenarios are prohibited by default in an effort to help prevent account administrators for Business Critical (or higher) accounts from inadvertently replicating sensitive data to accounts on lower editions."
}
},
"Attributes": {
"catalog": "The database parameter that specifies the default catalog to use for Iceberg tables. For more information, see [CATALOG](https://docs.snowflake.com/en/sql-reference/parameters#catalog).",
"comment": "Specifies a comment for the database.",
"data_retention_time_in_days": "Specifies the number of days for which Time Travel actions (CLONE and UNDROP) can be performed on the database, as well as specifying the default Time Travel retention time for all schemas created in the database. For more details, see [Understanding \u0026 Using Time Travel](https://docs.snowflake.com/en/user-guide/data-time-travel).",
"default_ddl_collation": "Specifies a default collation specification for all schemas and tables added to the database. It can be overridden on schema or table level. For more information, see [collation specification](https://docs.snowflake.com/en/sql-reference/collation#label-collation-specification).",
"enable_console_output": "If true, enables stdout/stderr fast path logging for anonymous stored procedures.",
"external_volume": "The database parameter that specifies the default external volume to use for Iceberg tables. For more information, see [EXTERNAL_VOLUME](https://docs.snowflake.com/en/sql-reference/parameters#external-volume).",
"id": "The ID of this resource.",
"is_transient": "Specifies the database as transient. Transient databases do not have a Fail-safe period so they do not incur additional storage costs once they leave Time Travel; however, this means they are also not protected by Fail-safe in the event of a data loss.",
"log_level": "Specifies the severity level of messages that should be ingested and made available in the active event table. Valid options are: [TRACE DEBUG INFO WARN ERROR FATAL OFF]. Messages at the specified level (and at more severe levels) are ingested. For more information, see [LOG_LEVEL](https://docs.snowflake.com/en/sql-reference/parameters.html#label-log-level).",
"max_data_extension_time_in_days": "Object parameter that specifies the maximum number of days for which Snowflake can extend the data retention period for tables in the database to prevent streams on the tables from becoming stale. For a detailed description of this parameter, see [MAX*DATA*EXTENSION*TIME*IN_DAYS](https://docs.snowflake.com/en/sql-reference/parameters.html#label-max-data-extension-time-in-days).",
"name": "Specifies the identifier for the database; must be unique for your account. As a best practice for [Database Replication and Failover](https://docs.snowflake.com/en/user-guide/db-replication-intro), it is recommended to give each secondary database the same name as its primary database. This practice supports referencing fully-qualified objects (i.e. '\\n\\n.\\n\\n.\\n\\n') by other objects in the same database, such as querying a fully-qualified table name in a view. If a secondary database has a different name from the primary database, then these object references would break in the secondary database.",
"quoted_identifiers_ignore_case": "If true, the case of quoted identifiers is ignored. For more information, see [QUOTED*IDENTIFIERS*IGNORE_CASE](https://docs.snowflake.com/en/sql-reference/parameters#quoted-identifiers-ignore-case).",
"replace_invalid_characters": "Specifies whether to replace invalid UTF-8 characters with the Unicode replacement character (�) in query results for an Iceberg table. You can only set this parameter for tables that use an external Iceberg catalog. For more information, see [REPLACE*INVALID*CHARACTERS](https://docs.snowflake.com/en/sql-reference/parameters#replace-invalid-characters).",
"replication": "Configures replication for a given database. When specified, this database will be promoted to serve as a primary database for replication. A primary database can be replicated in one or more accounts, allowing users in those accounts to query objects in each secondary (i.e. replica) database.",
"storage_serialization_policy": "The storage serialization policy for Iceberg tables that use Snowflake as the catalog. Valid options are: [COMPATIBLE OPTIMIZED]. COMPATIBLE: Snowflake performs encoding and compression of data files that ensures interoperability with third-party compute engines. OPTIMIZED: Snowflake performs encoding and compression of data files that ensures the best table performance within Snowflake. For more information, see [STORAGE*SERIALIZATION*POLICY](https://docs.snowflake.com/en/sql-reference/parameters#storage-serialization-policy).",
"suspend_task_after_num_failures": "How many times a task must fail in a row before it is automatically suspended. 0 disables auto-suspending. For more information, see [SUSPEND*TASK*AFTER*NUM*FAILURES](https://docs.snowflake.com/en/sql-reference/parameters#suspend-task-after-num-failures).",
"task_auto_retry_attempts": "Maximum automatic retries allowed for a user task. For more information, see [TASK*AUTO*RETRY_ATTEMPTS](https://docs.snowflake.com/en/sql-reference/parameters#task-auto-retry-attempts).",
"trace_level": "Controls how trace events are ingested into the event table. Valid options are: [ALWAYS ON*EVENT OFF]. For information about levels, see [TRACE*LEVEL](https://docs.snowflake.com/en/sql-reference/parameters.html#label-trace-level).",
"user_task_managed_initial_warehouse_size": "The initial size of warehouse to use for managed warehouses in the absence of history. For more information, see [USER*TASK*MANAGED*INITIAL*WAREHOUSE_SIZE](https://docs.snowflake.com/en/sql-reference/parameters#user-task-managed-initial-warehouse-size).",
"user_task_minimum_trigger_interval_in_seconds": "Minimum amount of time between Triggered Task executions in seconds.",
"user_task_timeout_ms": "User task execution timeout in milliseconds. For more information, see [USER*TASK*TIMEOUT_MS](https://docs.snowflake.com/en/sql-reference/parameters#user-task-timeout-ms)."
},
"Import": "## Import\n\n```sh\n$ pulumi import MISSING_TOK example 'database_name'\n```\n\n"
}