code.go

// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Adapted for mmark, by Miek Gieben, 2015.

package mmark

import (
	"bufio"
	"bytes"
	"errors"
	"io/ioutil"
	"regexp"
	"strconv"
	"unicode/utf8"
)

// SourceCodeTypes are the different languages that are supported as
// a type attribute in sourcecode, see Section 2.48.4 of XML2RFC v3 (-21).
var SourceCodeTypes = map[string]bool{
	"abnf":       true,
	"asn.1":      true,
	"bash":       true,
	"c++":        true,
	"c":          true,
	"cbor":       true,
	"dtd":        true,
	"java":       true,
	"javascript": true,
	"json":       true,
	"mib":        true,
	"perl":       true,
	"pseudocode": true,
	"python":     true,
	"rnc":        true,
	"xml":        true,

	"go": true,
}

// parseAddress parses a code address directive and returns the bytes.
func parseAddress(addr []byte, file []byte) []byte {
	bytes.TrimSpace(addr)

	textBytes, err := ioutil.ReadFile(string(file))
	if err != nil {
		printf(nil, "failed: `%s': %s", string(file), err)
		return nil
	}

	lo, hi, err := addrToByteRange(string(addr), 0, textBytes)
	if err != nil {
		printf(nil, "code include address: %s", err.Error())
		return textBytes
	}

	// Acme pattern matches can stop mid-line,
	// so run to end of line in both directions if not at line start/end.
	for lo > 0 && textBytes[lo-1] != '\n' {
		lo--
	}
	if hi > 0 {
		for hi < len(textBytes) && textBytes[hi-1] != '\n' {
			hi++
		}
	}

	lines := codeLines(textBytes, lo, hi)
	return lines
}

// codeLines takes a source file and returns the lines that
// span the byte range specified by start and end.
// It discards lines that end in "OMIT" and in "OMIT -->"
func codeLines(src []byte, start, end int) (lines []byte) {
	startLine := 1
	for i, b := range src {
		if i == start {
			break
		}
		if b == '\n' {
			startLine++
		}
	}
	s := bufio.NewScanner(bytes.NewReader(src[start:end]))
	for n := startLine; s.Scan(); n++ {
		l := s.Bytes()
		if bytes.HasSuffix(l, []byte("OMIT")) {
			continue
		}
		if bytes.HasSuffix(l, []byte("OMIT -->")) {
			continue
		}
		lines = append(lines, l...)
		lines = append(lines, '\n')
	}
	// TODO(miek): trim leading and trailing blanklines
	return
}

// This file is stolen from go/src/cmd/godoc/codewalk.go.
// It's an evaluator for the file address syntax implemented by acme and sam,
// but using Go-native regular expressions.
// To keep things reasonably close, this version uses (?m:re) for all user-provided
// regular expressions. That is the only change to the code from codewalk.go.
// See http://plan9.bell-labs.com/sys/doc/sam/sam.html Table II
// for details on the syntax.

// addrToByte evaluates the given address starting at offset start in data.
// It returns the lo and hi byte offset of the matched region within data.
func addrToByteRange(addr string, start int, data []byte) (lo, hi int, err error) {
	if addr == "" {
		lo, hi = start, len(data)
		return
	}
	var (
		dir        byte
		prevc      byte
		charOffset bool
	)
	lo = start
	hi = start
	for addr != "" && err == nil {
		c := addr[0]
		switch c {
		default:
			err = errors.New("invalid address syntax near " + string(c))
		case ',':
			if len(addr) == 1 {
				hi = len(data)
			} else {
				_, hi, err = addrToByteRange(addr[1:], hi, data)
			}
			return

		case '+', '-':
			if prevc == '+' || prevc == '-' {
				lo, hi, err = addrNumber(data, lo, hi, prevc, 1, charOffset)
			}
			dir = c

		case '$':
			lo = len(data)
			hi = len(data)
			if len(addr) > 1 {
				dir = '+'
			}

		case '#':
			charOffset = true

		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			var i int
			for i = 1; i < len(addr); i++ {
				if addr[i] < '0' || addr[i] > '9' {
					break
				}
			}
			var n int
			n, err = strconv.Atoi(addr[0:i])
			if err != nil {
				break
			}
			lo, hi, err = addrNumber(data, lo, hi, dir, n, charOffset)
			dir = 0
			charOffset = false
			prevc = c
			addr = addr[i:]
			continue

		case '/':
			var i, j int
		Regexp:
			for i = 1; i < len(addr); i++ {
				switch addr[i] {
				case '\\':
					i++
				case '/':
					j = i + 1
					break Regexp
				}
			}
			if j == 0 {
				j = i
			}
			pattern := addr[1:i]
			lo, hi, err = addrRegexp(data, lo, hi, dir, pattern)
			prevc = c
			addr = addr[j:]
			continue
		}
		prevc = c
		addr = addr[1:]
	}

	if err == nil && dir != 0 {
		lo, hi, err = addrNumber(data, lo, hi, dir, 1, charOffset)
	}
	if err != nil {
		return 0, 0, err
	}
	return lo, hi, nil
}

// addrNumber applies the given dir, n, and charOffset to the address lo, hi.
// dir is '+' or '-', n is the count, and charOffset is true if the syntax
// used was #n.  Applying +n (or +#n) means to advance n lines
// (or characters) after hi.  Applying -n (or -#n) means to back up n lines
// (or characters) before lo.
// The return value is the new lo, hi.
func addrNumber(data []byte, lo, hi int, dir byte, n int, charOffset bool) (int, int, error) {
	switch dir {
	case 0:
		lo = 0
		hi = 0
		fallthrough

	case '+':
		if charOffset {
			pos := hi
			for ; n > 0 && pos < len(data); n-- {
				_, size := utf8.DecodeRune(data[pos:])
				pos += size
			}
			if n == 0 {
				return pos, pos, nil
			}
			break
		}
		// find next beginning of line
		if hi > 0 {
			for hi < len(data) && data[hi-1] != '\n' {
				hi++
			}
		}
		lo = hi
		if n == 0 {
			return lo, hi, nil
		}
		for ; hi < len(data); hi++ {
			if data[hi] != '\n' {
				continue
			}
			switch n--; n {
			case 1:
				lo = hi + 1
			case 0:
				return lo, hi + 1, nil
			}
		}

	case '-':
		if charOffset {
			// Scan backward for bytes that are not UTF-8 continuation bytes.
			pos := lo
			for ; pos > 0 && n > 0; pos-- {
				if data[pos]&0xc0 != 0x80 {
					n--
				}
			}
			if n == 0 {
				return pos, pos, nil
			}
			break
		}
		// find earlier beginning of line
		for lo > 0 && data[lo-1] != '\n' {
			lo--
		}
		hi = lo
		if n == 0 {
			return lo, hi, nil
		}
		for ; lo >= 0; lo-- {
			if lo > 0 && data[lo-1] != '\n' {
				continue
			}
			switch n--; n {
			case 1:
				hi = lo
			case 0:
				return lo, hi, nil
			}
		}
	}

	return 0, 0, errors.New("address out of range")
}

// addrRegexp searches for pattern in the given direction starting at lo, hi.
// The direction dir is '+' (search forward from hi) or '-' (search backward from lo).
// Backward searches are unimplemented.
func addrRegexp(data []byte, lo, hi int, dir byte, pattern string) (int, int, error) {
	// We want ^ and $ to work as in sam/acme, so use ?m.
	re, err := regexp.Compile("(?m:" + pattern + ")")
	if err != nil {
		return 0, 0, err
	}
	if dir == '-' {
		// Could implement reverse search using binary search
		// through file, but that seems like overkill.
		return 0, 0, errors.New("reverse search not implemented")
	}
	m := re.FindIndex(data[hi:])
	if len(m) > 0 {
		m[0] += hi
		m[1] += hi
	} else if hi > 0 {
		// No match.  Wrap to beginning of data.
		m = re.FindIndex(data)
	}
	if len(m) == 0 {
		return 0, 0, errors.New("no match for " + pattern)
	}
	return m[0], m[1], nil
}