cli-docs-tool/markdown.go

// Copyright 2017 cli-docs-tool authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package clidocstool

import (
	"regexp"
	"strings"
	"unicode"
)

var (
	// mdHeading matches MarkDown H1..h6 headings. Note that this regex may produce
	// false positives for (e.g.) comments in code-blocks (# this is a comment),
	// so should not be used as a generic regex for other purposes.
	mdHeading = regexp.MustCompile(`^([#]{1,6})\s(.*)$`)
	// htmlAnchor matches inline HTML anchors. This is intended to only match anchors
	// for our use-case; DO NOT consider using this as a generic regex, or at least
	// not before reading https://stackoverflow.com/a/1732454/1811501.
	htmlAnchor = regexp.MustCompile(`<a\s+(?:name|id)="?([^"]+)"?\s*></a>\s*`)
	// relativeLink matches parts of internal links between .md documents
	// e.g. "](buildx_build.md)"
	relativeLink = regexp.MustCompile(`\]\((\.\/)?[a-z-_]+\.md(#.*)?\)`)
)

// getSections returns all H2 sections by title (lowercase)
func getSections(mdString string) map[string]string {
	parsedContent := strings.Split("\n"+mdString, "\n## ")
	sections := make(map[string]string, len(parsedContent))
	for _, s := range parsedContent {
		if strings.HasPrefix(s, "#") {
			// not a H2 Section
			continue
		}
		parts := strings.SplitN(s, "\n", 2)
		if len(parts) == 2 {
			sections[strings.ToLower(parts[0])] = parts[1]
		}
	}
	return sections
}

// cleanupMarkDown cleans up the MarkDown passed in mdString for inclusion in
// YAML. It removes trailing whitespace and substitutes tabs for four spaces
// to prevent YAML switching to use "compact" form; ("line1  \nline\t2\n")
// which, although equivalent, is hard to read.
func cleanupMarkDown(mdString string) (md string, anchors []string) {
	// remove leading/trailing whitespace, and replace tabs in the whole content
	mdString = strings.TrimSpace(mdString)
	mdString = strings.ReplaceAll(mdString, "\t", "    ")
	mdString = strings.ReplaceAll(mdString, "https://docs.docker.com", "")

	// Rewrite internal links, replacing relative paths with absolute path
	// e.g. from [docker buildx build](buildx_build.md#build-arg)
	// to [docker buildx build](/reference/cli/docker/buildx/build/#build-arg)
	mdString = relativeLink.ReplaceAllStringFunc(mdString, func(link string) string {
		link = strings.TrimLeft(link, "](./")
		link = strings.ReplaceAll(link, "_", "/")
		link = strings.ReplaceAll(link, ".md", "/")
		return "](/reference/cli/docker/" + link
	})

	var id string
	// replace trailing whitespace per line, and handle custom anchors
	lines := strings.Split(mdString, "\n")
	for i := 0; i < len(lines); i++ {
		lines[i] = strings.TrimRightFunc(lines[i], unicode.IsSpace)
		lines[i], id = convertHTMLAnchor(lines[i])
		if id != "" {
			anchors = append(anchors, id)
		}
	}
	return strings.Join(lines, "\n"), anchors
}

// convertHTMLAnchor converts inline anchor-tags in headings (<a name=myanchor></a>)
// to an extended-markdown property ({#myanchor}). Extended Markdown properties
// are not supported in GitHub Flavored Markdown, but are supported by Jekyll,
// and lead to cleaner HTML in our docs, and prevents duplicate anchors.
// It returns the converted MarkDown heading and the custom ID (if present)
func convertHTMLAnchor(mdLine string) (md string, customID string) {
	if m := mdHeading.FindStringSubmatch(mdLine); len(m) > 0 {
		if a := htmlAnchor.FindStringSubmatch(m[2]); len(a) > 0 {
			customID = a[1]
			mdLine = m[1] + " " + htmlAnchor.ReplaceAllString(m[2], "") + " {#" + customID + "}"
		}
	}
	return mdLine, customID
}
Rename project Signed-off-by: CrazyMax <crazy-max@users.noreply.github.com> 2021-08-20 16:55:17 +03:00			`// Copyright 2017 cli-docs-tool authors`
Add license headers Signed-off-by: CrazyMax <crazy-max@users.noreply.github.com> 2021-08-10 01:06:18 +03:00			`//`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`//`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`

Rename project Signed-off-by: CrazyMax <crazy-max@users.noreply.github.com> 2021-08-20 16:55:17 +03:00			`package clidocstool`
yamldocs: various improvements - make sure the target directory is created if missing - add support for custom ID's in headings through `<a>` tags (e.g. `<a name=heading2></a>`). This allows use of custom anchors that work both on GitHub (GFM doesn't support extended MarkDown), and in Jekyll (which does). - add code to cleanup markdown for use in our docs: - remove absolute URLs to https://docs.docker.com - remove tabs in MarkDown, and convert them to 4 spaces. This prevents the YAML conversion from switching between "short" and "long" syntax. Tabs in code examples also don't always work well, so using spaces doesn't hurt for that. - refactor some code for readability, and to be less "hacky" (still lots to be improved though) Signed-off-by: Sebastiaan van Stijn <github@gone.nl> 2021-04-01 00:36:03 +03:00
			`import (`
			`"regexp"`
			`"strings"`
			`"unicode"`
			`)`

			`var (`
			`// mdHeading matches MarkDown H1..h6 headings. Note that this regex may produce`
			`// false positives for (e.g.) comments in code-blocks (# this is a comment),`
			`// so should not be used as a generic regex for other purposes.`
			mdHeading = regexp.MustCompile(`^([#]{1,6})\s(.*)$`)
			`// htmlAnchor matches inline HTML anchors. This is intended to only match anchors`
			`// for our use-case; DO NOT consider using this as a generic regex, or at least`
			`// not before reading https://stackoverflow.com/a/1732454/1811501.`
			htmlAnchor = regexp.MustCompile(`<a\s+(?:name\|id)="?([^"]+)"?\s></a>\s`)
yaml: add processing for internal links, and tests Signed-off-by: David Karlsson <35727626+dvdksn@users.noreply.github.com> 2024-02-11 23:51:55 +03:00			`// relativeLink matches parts of internal links between .md documents`
			`// e.g. "](buildx_build.md)"`
			relativeLink = regexp.MustCompile(`\]\((\.\/)?[a-z-_]+\.md(#.*)?\)`)
yamldocs: various improvements - make sure the target directory is created if missing - add support for custom ID's in headings through `<a>` tags (e.g. `<a name=heading2></a>`). This allows use of custom anchors that work both on GitHub (GFM doesn't support extended MarkDown), and in Jekyll (which does). - add code to cleanup markdown for use in our docs: - remove absolute URLs to https://docs.docker.com - remove tabs in MarkDown, and convert them to 4 spaces. This prevents the YAML conversion from switching between "short" and "long" syntax. Tabs in code examples also don't always work well, so using spaces doesn't hurt for that. - refactor some code for readability, and to be less "hacky" (still lots to be improved though) Signed-off-by: Sebastiaan van Stijn <github@gone.nl> 2021-04-01 00:36:03 +03:00			`)`

			`// getSections returns all H2 sections by title (lowercase)`
			`func getSections(mdString string) map[string]string {`
			`parsedContent := strings.Split("\n"+mdString, "\n## ")`
			`sections := make(map[string]string, len(parsedContent))`
			`for _, s := range parsedContent {`
			`if strings.HasPrefix(s, "#") {`
			`// not a H2 Section`
			`continue`
			`}`
			`parts := strings.SplitN(s, "\n", 2)`
			`if len(parts) == 2 {`
			`sections[strings.ToLower(parts[0])] = parts[1]`
			`}`
			`}`
			`return sections`
			`}`

			`// cleanupMarkDown cleans up the MarkDown passed in mdString for inclusion in`
			`// YAML. It removes trailing whitespace and substitutes tabs for four spaces`
			`// to prevent YAML switching to use "compact" form; ("line1 \nline\t2\n")`
			`// which, although equivalent, is hard to read.`
			`func cleanupMarkDown(mdString string) (md string, anchors []string) {`
			`// remove leading/trailing whitespace, and replace tabs in the whole content`
			`mdString = strings.TrimSpace(mdString)`
			`mdString = strings.ReplaceAll(mdString, "\t", " ")`
			`mdString = strings.ReplaceAll(mdString, "https://docs.docker.com", "")`

yaml: add processing for internal links, and tests Signed-off-by: David Karlsson <35727626+dvdksn@users.noreply.github.com> 2024-02-11 23:51:55 +03:00			`// Rewrite internal links, replacing relative paths with absolute path`
			`// e.g. from [docker buildx build](buildx_build.md#build-arg)`
			`// to [docker buildx build](/reference/cli/docker/buildx/build/#build-arg)`
			`mdString = relativeLink.ReplaceAllStringFunc(mdString, func(link string) string {`
			`link = strings.TrimLeft(link, "](./")`
			`link = strings.ReplaceAll(link, "_", "/")`
			`link = strings.ReplaceAll(link, ".md", "/")`
			`return "](/reference/cli/docker/" + link`
			`})`

yamldocs: various improvements - make sure the target directory is created if missing - add support for custom ID's in headings through `<a>` tags (e.g. `<a name=heading2></a>`). This allows use of custom anchors that work both on GitHub (GFM doesn't support extended MarkDown), and in Jekyll (which does). - add code to cleanup markdown for use in our docs: - remove absolute URLs to https://docs.docker.com - remove tabs in MarkDown, and convert them to 4 spaces. This prevents the YAML conversion from switching between "short" and "long" syntax. Tabs in code examples also don't always work well, so using spaces doesn't hurt for that. - refactor some code for readability, and to be less "hacky" (still lots to be improved though) Signed-off-by: Sebastiaan van Stijn <github@gone.nl> 2021-04-01 00:36:03 +03:00			`var id string`
			`// replace trailing whitespace per line, and handle custom anchors`
			`lines := strings.Split(mdString, "\n")`
			`for i := 0; i < len(lines); i++ {`
			`lines[i] = strings.TrimRightFunc(lines[i], unicode.IsSpace)`
			`lines[i], id = convertHTMLAnchor(lines[i])`
			`if id != "" {`
			`anchors = append(anchors, id)`
			`}`
			`}`
			`return strings.Join(lines, "\n"), anchors`
			`}`

			`// convertHTMLAnchor converts inline anchor-tags in headings (<a name=myanchor></a>)`
			`// to an extended-markdown property ({#myanchor}). Extended Markdown properties`
			`// are not supported in GitHub Flavored Markdown, but are supported by Jekyll,`
			`// and lead to cleaner HTML in our docs, and prevents duplicate anchors.`
			`// It returns the converted MarkDown heading and the custom ID (if present)`
			`func convertHTMLAnchor(mdLine string) (md string, customID string) {`
			`if m := mdHeading.FindStringSubmatch(mdLine); len(m) > 0 {`
			`if a := htmlAnchor.FindStringSubmatch(m[2]); len(a) > 0 {`
			`customID = a[1]`
			`mdLine = m[1] + " " + htmlAnchor.ReplaceAllString(m[2], "") + " {#" + customID + "}"`
			`}`
			`}`
			`return mdLine, customID`
			`}`