Merge pull request #15 from wjdp/document-store

Check internal hashes (Implement document store)
wjdp · Nov 13, 2016 · 22d855b · 22d855b
2 parents 1f8a385 + 1e5bde5
commit 22d855b
Show file tree

Hide file tree

Showing 41 changed files with 725 additions and 232 deletions.
diff --git a/README.md b/README.md
@@ -44,13 +44,12 @@ Options:
 Many options of the following tests can be customised. Items marked :soon: are not checked yet, but will be *soon*.
 
 - `a` `link` `img` `script`: Whether internal links work / are valid.
-- `a`: :soon: Whether internal hashes work.
+- `a`: Whether internal hashes work.
 - `a` `link` `img` `script`: Whether external links work.
 - `a`: :soon: Whether external hashes work.
 - `a` `link`: Whether external links use HTTPS.
-- `a` `link`: Whether external links use HTTPS.
 - `img`: Whether your images have valid alt attributes.
-- `meta`: :soon: Whether favicons are valid.
+- `link`: Whether pages have a valid favicon.
 - `meta`: :soon: Whether images and URLs in the OpenGraph metadata are valid.
 - `meta` `title`: :soon: Whether you've got the [recommended tags](https://support.google.com/webmasters/answer/79812?hl=en) in your head.
 
@@ -90,12 +89,13 @@ htmltest uses a YAML configuration file. Put `.htmltest.yml` in the same directo
 | `CheckScripts` | Enables checking `<script…` tags. | `true` |
 | `CheckExternal` | Enables external reference checking; all tag types. | `true` |
 | `CheckInternal` | Enables internal reference checking; all tag types. | `true` |
+| `CheckInternalHash` | Enables internal hash/fragment checking. | `true` |
 | `CheckMailto` | Enables–albeit quite basic–`mailto:` link checking. | `true` |
 | `CheckTel` | Enables–albeit quite basic–`tel:` link checking. | `true` |
 | `CheckFavicon` | Enables favicon checking, ensures every page has a favicon set. | `false` |
 | `EnforceHTTPS` | Fails when encountering an `http://` link. Useful to prevent mixed content errors when serving over HTTPS. | `false` |
-| `IgnoreURLs` | Array of strings or regexs of URLs to ignore. | empty |
-| `IgnoreDirs` | Array of strings or regexs of directories to ignore when scanning for HTML files. | empty |
+| `IgnoreURLs` | Array of regexes of URLs to ignore. | empty |
+| `IgnoreDirs` | Array of regexes of directories to ignore when scanning for HTML files. | empty |
 | `IgnoreCanonicalBrokenLinks` | When true produces a warning, rather than an error for broken canonical links. When testing a site which isn't live yet or before publishing a new page canonical links will fail. | `true` |
 | `IgnoreAltMissing` | Turns off image alt attribute checking. | `false` |
 | `IgnoreDirectoryMissingTrailingSlash` | Turns off errors for links to directories without a trailing slash. | `false` |

diff --git a/htmltest/attr.go → htmldoc/attr.go b/htmltest/attr.go → htmldoc/attr.go
@@ -1,10 +1,10 @@
-package htmltest
+package htmldoc
 
 import (
 	"golang.org/x/net/html"
 )
 
-func extractAttrs(attrs []html.Attribute, keys []string) map[string]string {
+func ExtractAttrs(attrs []html.Attribute, keys []string) map[string]string {
 	attrMap := make(map[string]string)
 	for _, attr := range attrs {
 		for i, key := range keys {
@@ -18,11 +18,22 @@ func extractAttrs(attrs []html.Attribute, keys []string) map[string]string {
 	return attrMap
 }
 
-func attrPresent(attrs []html.Attribute, key string) bool {
+// AttrPresent reports whether attrs contains an attribute whose key is
+// exactly key. The attribute's value is not inspected.
+func AttrPresent(attrs []html.Attribute, key string) bool {
 	for _, attr := range attrs {
 		if attr.Key == key {
 			return true
 		}
 	}
 	return false
 }
+
+// GetId returns the fragment identifier declared by an element's attributes.
+//
+// An "id" attribute always wins. The first "name" attribute is used only as
+// a fallback when no "id" is present, so the result does not depend on the
+// order in which the two attributes were written in the markup. It returns
+// "" when neither attribute is present.
+func GetId(attrs []html.Attribute) string {
+	fallback := ""
+	haveName := false
+	for _, attr := range attrs {
+		switch attr.Key {
+		case "id":
+			// id takes precedence regardless of where it appears.
+			return attr.Val
+		case "name":
+			// Remember only the first name; keep scanning for an id.
+			if !haveName {
+				fallback = attr.Val
+				haveName = true
+			}
+		}
+	}
+	return fallback
+}
diff --git a/htmldoc/attr_test.go b/htmldoc/attr_test.go
@@ -0,0 +1,45 @@
+package htmldoc
+
+import (
+	"github.com/daviddengcn/go-assert"
+	"golang.org/x/net/html"
+	"strings"
+	"testing"
+)
+
+func TestExtractAttrs(t *testing.T) {
+	snip := "<img src=\"x\" alt=\"y\" />"
+	nodeDoc, _ := html.Parse(strings.NewReader(snip))
+	nodeImg := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild
+	attrs := ExtractAttrs(nodeImg.Attr, []string{"src", "alt"})
+
+	assert.Equals(t, "src", attrs["src"], "x")
+	assert.Equals(t, "alt", attrs["alt"], "y")
+	assert.NotEquals(t, "foo", attrs["foo"], "bar")
+}
+
+// TestAttrPresent checks that AttrPresent reports true for each attribute
+// that is on the element and false for one that is not.
+func TestAttrPresent(t *testing.T) {
+	snip := "<img src=\"x\" alt=\"y\" />"
+	nodeDoc, _ := html.Parse(strings.NewReader(snip))
+	// Walk document -> first child -> its first child -> next sibling -> img.
+	nodeImg := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild
+
+	// Each assertion queries the attribute its label names.
+	assert.Equals(t, "src in attr", AttrPresent(nodeImg.Attr, "src"), true)
+	assert.Equals(t, "alt in attr", AttrPresent(nodeImg.Attr, "alt"), true)
+	assert.Equals(t, "foo in attr", AttrPresent(nodeImg.Attr, "foo"), false)
+}
+
+func TestAttrValIdId(t *testing.T) {
+	snip := "<h1 id=\"x\" />"
+	nodeDoc, _ := html.Parse(strings.NewReader(snip))
+	nodeH1 := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild
+
+	assert.Equals(t, "h1 id", GetId(nodeH1.Attr), "x")
+}
+
+func TestAttrValIdName(t *testing.T) {
+	snip := "<h1 name=\"x\" />"
+	nodeDoc, _ := html.Parse(strings.NewReader(snip))
+	nodeH1 := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild
+
+	assert.Equals(t, "h1 name", GetId(nodeH1.Attr), "x")
+}
diff --git a/htmldoc/document.go b/htmldoc/document.go
@@ -3,24 +3,42 @@ package htmldoc
 import (
 	"golang.org/x/net/html"
 	"os"
-	"path"
-	"regexp"
+	"sync"
 )
 
+// Document describes one HTML file of the site together with the data
+// extracted from it once it has been parsed.
 type Document struct {
-	FilePath  string // Relative to the shell session
-	SitePath  string // Relative to the site root
-	Directory string
-	HTMLNode  *html.Node
-	State     DocumentState
+	FilePath        string // Relative to the shell session
+	SitePath        string // Relative to the site root
+	Directory       string
+	// htmlMutex is held by Parse for the whole parse, so concurrent callers
+	// wait rather than parsing the same file twice.
+	htmlMutex       *sync.Mutex
+	// htmlNode is the root returned by html.Parse; nil until Parse has run.
+	htmlNode        *html.Node
+	// hashMap maps fragment identifiers (as returned by GetId) to their nodes.
+	hashMap         map[string]*html.Node
+	// NodesOfInterest collects the a, link, img and script elements found
+	// while parsing.
+	NodesOfInterest []*html.Node
+	State           DocumentState
 }
 
 // DocumentState holds per-document state used by checks that depend on the
 // document being parsed.
 type DocumentState struct {
 	// NOTE(review): presumably set once a favicon link is seen in the
 	// document; the code that writes it is not visible here, so confirm.
 	FaviconPresent bool
 }
 
+// Init allocates the document's internal state: the parse mutex, the empty
+// NodesOfInterest slice and the empty fragment-identifier map. It must be
+// called before Parse or IsHashValid.
+func (doc *Document) Init() {
+	// A NewXYZ() constructor would be the usual Go convention, but Document
+	// has many optional fields and Go has no parameter overloading, so
+	// callers fill in a struct literal and then call Init.
+	doc.hashMap = map[string]*html.Node{}
+	doc.NodesOfInterest = []*html.Node{}
+	doc.htmlMutex = new(sync.Mutex)
+}
+
 func (doc *Document) Parse() {
+	// Parse the document
+	// Either called when the document is tested or when another document needs
+	// data from this one.
+	doc.htmlMutex.Lock() // MUTEX
+	if doc.htmlNode != nil {
+		doc.htmlMutex.Unlock() // MUTEX
+		return
+	}
 	// Open, parse, and close document
 	f, err := os.Open(doc.FilePath)
 	checkErr(err)
@@ -29,73 +47,34 @@ func (doc *Document) Parse() {
 	htmlNode, err := html.Parse(f)
 	checkErr(err)
 
-	doc.HTMLNode = htmlNode
-}
-
-func DocumentsFromDir(path string, ignorePatterns []interface{}) []Document {
-	// Nice proxy for recurseDir
-	return recurseDir(path, ignorePatterns, "")
+	doc.htmlNode = htmlNode
+	doc.parseNode(htmlNode)
+	doc.htmlMutex.Unlock() // MUTEX
 }
 
-func recurseDir(basePath string, ignorePatterns []interface{}, dPath string) []Document {
-	// Recursive function that returns all Document struts in a given
-	// os directory.
-	// basePath: the directory to scan
-	// dPath: the subdirectory within basePath we're scanning
-	// ignorePatterns: string slice of dPaths to ignore
-
-	documents := make([]Document, 0)
-
-	if isDirIgnored(ignorePatterns, dPath) {
-		return documents
-	}
-
-	// Open directory to scan
-	f, err := os.Open(path.Join(basePath, dPath))
-	checkErr(err)
-	defer f.Close()
-
-	// Get FileInfo of directory (scan it)
-	fi, err := f.Stat()
-	checkErr(err)
-
-	if fi.IsDir() { // Double check we're dealing with a directory
-		// Read all FileInfo-s from directory, Readdir(count int)
-		fis, err := f.Readdir(-1)
-		checkErr(err)
-
-		// Iterate over contents of directory
-		for _, fileinfo := range fis {
-			fPath := path.Join(dPath, fileinfo.Name())
-			if fileinfo.IsDir() {
-				// If item is a dir, we need to iterate further, save returned documents
-				documents = append(documents, recurseDir(basePath, ignorePatterns, fPath)...)
-			} else if path.Ext(fileinfo.Name()) == ".html" || path.Ext(fileinfo.Name()) == ".htm" {
-				// If a file, save to filename list
-				documents = append(documents, Document{
-					FilePath:  path.Join(basePath, fPath),
-					SitePath:  fPath,
-					Directory: dPath,
-				})
-			}
+// parseNode walks the tree rooted at n depth-first. For every element node
+// it records the fragment identifier (if any) in hashMap and appends a,
+// link, img and script elements to NodesOfInterest. It does not descend into
+// pre or code elements, although the pre/code element's own identifier is
+// still recorded.
+func (doc *Document) parseNode(n *html.Node) {
+	if n.Type == html.ElementNode {
+		// If present save fragment identifier to the hashMap
+		// (a later element with the same identifier overwrites an earlier one)
+		nodeId := GetId(n.Attr)
+		if nodeId != "" {
+			doc.hashMap[nodeId] = n
+		}
+		// Identify and store tags of interest
+		switch n.Data {
+		case "a", "link", "img", "script":
+			doc.NodesOfInterest = append(doc.NodesOfInterest, n)
+		case "pre", "code":
+			return // Everything within these elements is not to be interpreted
 		}
-	} else { // It's a file, return single file
-		filePath := path.Join(basePath, dPath)
-		documents = append(documents, Document{
-			FilePath:  filePath,
-			SitePath:  path.Base(filePath),
-			Directory: dPath,
-		})
 	}
-
-	return documents
+	// Iterate over children
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		doc.parseNode(c)
+	}
 }
 
-func isDirIgnored(ignorePatterns []interface{}, dir string) bool {
-	for _, item := range ignorePatterns {
-		if ok, _ := regexp.MatchString(item.(string), dir+"/"); ok {
-			return true
-		}
-	}
-	return false
+// IsHashValid reports whether hash is a fragment identifier recorded for
+// this document. The document is parsed first if that has not happened yet.
+func (doc *Document) IsHashValid(hash string) bool {
+	// Parse returns immediately when the document has already been parsed.
+	doc.Parse()
+	if _, found := doc.hashMap[hash]; found {
+		return true
+	}
+	return false
 }
diff --git a/htmldoc/document_store.go b/htmldoc/document_store.go
@@ -0,0 +1,117 @@
+package htmldoc
+
+import (
+	"os"
+	"path"
+	"regexp"
+)
+
+// DocumentStore holds every document discovered under BasePath, both as a
+// slice for iteration and as a map keyed by site path for lookups.
+type DocumentStore struct {
+	BasePath          string               // Path, relative to cwd, the site is located in
+	IgnorePatterns    []interface{}        // Regexes of directories to ignore
+	Documents         []*Document          // All of the documents, used to iterate over
+	DocumentPathMap   map[string]*Document // Maps slash separated paths to documents
+	DocumentExtension string               // File extension to look for
+	DirectoryIndex    string               // What file is the index of the directory
+}
+
+func NewDocumentStore() DocumentStore {
+	return DocumentStore{
+		Documents:       make([]*Document, 0),
+		DocumentPathMap: make(map[string]*Document),
+	}
+}
+
+// AddDocument registers doc with the store: it is indexed by its SitePath in
+// DocumentPathMap and appended to Documents.
+func (dS *DocumentStore) AddDocument(doc *Document) {
+	key := doc.SitePath
+	dS.DocumentPathMap[key] = doc
+	dS.Documents = append(dS.Documents, doc)
+}
+
+func (dS *DocumentStore) Discover() {
+	// Find all documents in BasePath
+	dS.discoverRecurse(".")
+}
+
+// isDirIgnored reports whether dir matches any of the store's
+// IgnorePatterns. Each pattern is used as a regular expression and matched
+// against dir with a trailing slash appended.
+func (dS *DocumentStore) isDirIgnored(dir string) bool {
+	candidate := dir + "/"
+	for i := range dS.IgnorePatterns {
+		pattern := dS.IgnorePatterns[i].(string)
+		// The error is discarded: a pattern that fails to compile simply
+		// counts as a non-match.
+		matched, _ := regexp.MatchString(pattern, candidate)
+		if matched {
+			return true
+		}
+	}
+	return false
+}
+
+// discoverRecurse scans the directory dPath (relative to BasePath), adds a
+// Document to the store for every file whose extension equals
+// DocumentExtension, and recurses into sub-directories. Ignored directories
+// are skipped entirely. It panics if dPath turns out not to be a directory.
+func (dS *DocumentStore) discoverRecurse(dPath string) {
+	if dS.isDirIgnored(dPath) {
+		return
+	}
+
+	// Open the directory; the handle stays open until this call returns.
+	dir, err := os.Open(path.Join(dS.BasePath, dPath))
+	checkErr(err)
+	defer dir.Close()
+
+	info, err := dir.Stat()
+	checkErr(err)
+
+	// Only directories are expected here; anything else is a caller bug.
+	if !info.IsDir() {
+		panic("discoverRecurse encountered a file: " + dPath)
+	}
+
+	// A negative count makes Readdir return every entry in one call.
+	entries, err := dir.Readdir(-1)
+	checkErr(err)
+
+	for _, entry := range entries {
+		entryPath := path.Join(dPath, entry.Name())
+		switch {
+		case entry.IsDir():
+			// Sub-directory: descend into it.
+			dS.discoverRecurse(entryPath)
+		case path.Ext(entry.Name()) == dS.DocumentExtension:
+			// Matching file: build the document and register it.
+			found := &Document{
+				FilePath:  path.Join(dS.BasePath, entryPath),
+				SitePath:  entryPath,
+				Directory: dPath,
+			}
+			found.Init()
+			dS.AddDocument(found)
+		}
+	}
+}
+
+// ResolvePath resolves an internal, absolute site path to a document in the
+// store.
+//
+// Lookup order:
+//  1. "/" resolves to the root DirectoryIndex document.
+//  2. The path with any leading slash removed is tried as-is.
+//  3. The path is tried as a directory, i.e. joined with DirectoryIndex.
+//
+// The boolean result reports whether a document was found. An empty refPath
+// resolves to nothing and yields (nil, false).
+func (dS *DocumentStore) ResolvePath(refPath string) (*Document, bool) {
+	// Guard against the empty string before indexing into refPath below.
+	if refPath == "" {
+		return nil, false
+	}
+
+	// Match root document
+	if refPath == "/" {
+		rootDoc, ok := dS.DocumentPathMap[dS.DirectoryIndex]
+		return rootDoc, ok
+	}
+
+	// Absolute link: drop the leading slash for the map lookup. refPath is
+	// neither empty nor "/" here, so at least one character remains.
+	if refPath[0] == '/' {
+		refPath = refPath[1:]
+	}
+
+	// Try path as-is, path.ext
+	if doc, ok := dS.DocumentPathMap[refPath]; ok {
+		return doc, true
+	}
+
+	// Try as a directory, path.ext/index.html
+	indexDoc, ok := dS.DocumentPathMap[path.Join(refPath, dS.DirectoryIndex)]
+	return indexDoc, ok
+}
+
+// ResolveRef resolves the site path of ref to a document in the store; the
+// boolean result reports whether a document was found.
+func (dS *DocumentStore) ResolveRef(ref *Reference) (*Document, bool) {
+	sitePath := ref.RefSitePath()
+	return dS.ResolvePath(sitePath)
+}