Skip to content

Commit

Permalink
Merge pull request #15 from wjdp/document-store
Browse files Browse the repository at this point in the history
Check internal hashes (Implement document store)
  • Loading branch information
wjdp authored Nov 13, 2016
2 parents 1f8a385 + 1e5bde5 commit 22d855b
Show file tree
Hide file tree
Showing 41 changed files with 725 additions and 232 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,12 @@ Options:
Many options of the following tests can be customised. Items marked :soon: are not checked yet, but will be *soon*.

- `a` `link` `img` `script`: Whether internal links work / are valid.
- `a`: :soon: Whether internal hashes work.
- `a`: Whether internal hashes work.
- `a` `link` `img` `script`: Whether external links work.
- `a`: :soon: Whether external hashes work.
- `a` `link`: Whether external links use HTTPS.
- `a` `link`: Whether external links use HTTPS.
- `img`: Whether your images have valid alt attributes.
- `meta`: :soon: Whether favicons are valid.
- `link`: Whether pages have a valid favicon.
- `meta`: :soon: Whether images and URLs in the OpenGraph metadata are valid.
- `meta` `title`: :soon: Whether you've got the [recommended tags](https://support.google.com/webmasters/answer/79812?hl=en) in your head.

Expand Down Expand Up @@ -90,12 +89,13 @@ htmltest uses a YAML configuration file. Put `.htmltest.yml` in the same directo
| `CheckScripts` | Enables checking `<script…` tags. | `true` |
| `CheckExternal` | Enables external reference checking; all tag types. | `true` |
| `CheckInternal` | Enables internal reference checking; all tag types. | `true` |
| `CheckInternalHash` | Enables internal hash/fragment checking. | `true` |
| `CheckMailto` | Enables–albeit quite basic–`mailto:` link checking. | `true` |
| `CheckTel` | Enables–albeit quite basic–`tel:` link checking. | `true` |
| `CheckFavicon` | Enables favicon checking, ensures every page has a favicon set. | `false` |
| `EnforceHTTPS` | Fails when encountering an `http://` link. Useful to prevent mixed content errors when serving over HTTPS. | `false` |
| `IgnoreURLs` | Array of strings or regexs of URLs to ignore. | empty |
| `IgnoreDirs` | Array of strings or regexs of directories to ignore when scanning for HTML files. | empty |
| `IgnoreURLs` | Array of regexes of URLs to ignore. | empty |
| `IgnoreDirs` | Array of regexes of directories to ignore when scanning for HTML files. | empty |
| `IgnoreCanonicalBrokenLinks` | When true, produces a warning rather than an error for broken canonical links. When testing a site which isn't live yet, or before publishing a new page, canonical links will fail. | `true` |
| `IgnoreAltMissing` | Turns off image alt attribute checking. | `false` |
| `IgnoreDirectoryMissingTrailingSlash` | Turns off errors for links to directories without a trailing slash. | `false` |
Expand Down
17 changes: 14 additions & 3 deletions htmltest/attr.go → htmldoc/attr.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
package htmltest
package htmldoc

import (
"golang.org/x/net/html"
)

func extractAttrs(attrs []html.Attribute, keys []string) map[string]string {
func ExtractAttrs(attrs []html.Attribute, keys []string) map[string]string {
attrMap := make(map[string]string)
for _, attr := range attrs {
for i, key := range keys {
Expand All @@ -18,11 +18,22 @@ func extractAttrs(attrs []html.Attribute, keys []string) map[string]string {
return attrMap
}

func attrPresent(attrs []html.Attribute, key string) bool {
// AttrPresent reports whether attrs contains an attribute whose key equals
// key. Only the key is compared; the attribute's value is not inspected.
func AttrPresent(attrs []html.Attribute, key string) bool {
	found := false
	// Stop scanning as soon as the first matching key has been seen.
	for i := 0; i < len(attrs) && !found; i++ {
		found = attrs[i].Key == key
	}
	return found
}

// GetId returns the value of the first attribute in attrs whose key is
// either "id" or "name", whichever appears first in slice order. It returns
// the empty string when neither attribute is present.
func GetId(attrs []html.Attribute) string {
	for i := range attrs {
		switch attrs[i].Key {
		case "id", "name":
			return attrs[i].Val
		}
	}
	return ""
}
45 changes: 45 additions & 0 deletions htmldoc/attr_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package htmldoc

import (
"github.com/daviddengcn/go-assert"
"golang.org/x/net/html"
"strings"
"testing"
)

// TestExtractAttrs checks that ExtractAttrs returns the requested attributes
// of a parsed <img> element and does not invent a value for an absent one.
func TestExtractAttrs(t *testing.T) {
	root, _ := html.Parse(strings.NewReader("<img src=\"x\" alt=\"y\" />"))
	// Walk document -> <html> -> <head> -> <body> -> <img>.
	htmlElem := root.FirstChild
	body := htmlElem.FirstChild.NextSibling
	img := body.FirstChild

	got := ExtractAttrs(img.Attr, []string{"src", "alt"})

	for _, want := range []struct{ key, val string }{
		{"src", "x"},
		{"alt", "y"},
	} {
		assert.Equals(t, want.key, got[want.key], want.val)
	}
	assert.NotEquals(t, "foo", got["foo"], "bar")
}

// TestAttrPresent checks that AttrPresent reports true for each attribute
// that is on the parsed <img> element and false for one that is not.
func TestAttrPresent(t *testing.T) {
	snip := "<img src=\"x\" alt=\"y\" />"
	nodeDoc, err := html.Parse(strings.NewReader(snip))
	if err != nil {
		t.Fatalf("parsing %q: %v", snip, err)
	}
	// Walk document -> <html> -> <head> -> <body> -> <img>.
	nodeImg := nodeDoc.FirstChild.FirstChild.NextSibling.FirstChild

	// Each assertion queries the key named in its message; previously all
	// three queried "src", so "alt" and the absent "foo" were never tested.
	assert.Equals(t, "src in attr", AttrPresent(nodeImg.Attr, "src"), true)
	assert.Equals(t, "alt in attr", AttrPresent(nodeImg.Attr, "alt"), true)
	assert.Equals(t, "foo in attr", AttrPresent(nodeImg.Attr, "foo"), false)
}

// TestAttrValIdId checks that GetId returns the value of an element's "id"
// attribute.
func TestAttrValIdId(t *testing.T) {
	root, _ := html.Parse(strings.NewReader("<h1 id=\"x\" />"))
	// Walk document -> <html> -> <head> -> <body> -> <h1>.
	body := root.FirstChild.FirstChild.NextSibling
	heading := body.FirstChild

	got := GetId(heading.Attr)
	assert.Equals(t, "h1 id", got, "x")
}

// TestAttrValIdName checks that GetId falls back to the value of an
// element's "name" attribute when that is the attribute present.
func TestAttrValIdName(t *testing.T) {
	root, _ := html.Parse(strings.NewReader("<h1 name=\"x\" />"))
	// Walk document -> <html> -> <head> -> <body> -> <h1>.
	body := root.FirstChild.FirstChild.NextSibling
	heading := body.FirstChild

	got := GetId(heading.Attr)
	assert.Equals(t, "h1 name", got, "x")
}
119 changes: 49 additions & 70 deletions htmldoc/document.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,42 @@ package htmldoc
import (
"golang.org/x/net/html"
"os"
"path"
"regexp"
"sync"
)

// Document describes a single HTML file and the data extracted from it.
// Init sets up the mutex, slice and map; Parse loads the file and fills in
// the parse results.
type Document struct {
FilePath string // Relative to the shell session
SitePath string // Relative to the site root
Directory string
HTMLNode *html.Node
State DocumentState
FilePath string // Relative to the shell session
SitePath string // Relative to the site root
Directory string
htmlMutex *sync.Mutex // Held by Parse while it checks for, loads and walks the parsed tree
htmlNode *html.Node // Root of the parsed tree; nil until Parse has run
hashMap map[string]*html.Node // Fragment identifier (id/name value) -> element carrying it
NodesOfInterest []*html.Node // a, link, img and script elements collected while parsing
State DocumentState
}

// DocumentState holds per-document flags used by checks that depend on the
// document being parsed.
type DocumentState struct {
FaviconPresent bool // NOTE(review): presumably set when a favicon link is seen; the writer is not visible here — confirm
}

// Init sets up the document's internal containers: the fragment identifier
// map, the (empty, non-nil) NodesOfInterest slice and the parse mutex.
//
// This does not follow the NewXYZ() constructor convention because Document
// has many optional parameters and Go has no parameter overloading.
func (doc *Document) Init() {
	doc.hashMap = map[string]*html.Node{}
	doc.NodesOfInterest = []*html.Node{}
	doc.htmlMutex = new(sync.Mutex)
}

func (doc *Document) Parse() {
// Parse the document
// Either called when the document is tested or when another document needs
// data from this one.
doc.htmlMutex.Lock() // MUTEX
if doc.htmlNode != nil {
doc.htmlMutex.Unlock() // MUTEX
return
}
// Open, parse, and close document
f, err := os.Open(doc.FilePath)
checkErr(err)
Expand All @@ -29,73 +47,34 @@ func (doc *Document) Parse() {
htmlNode, err := html.Parse(f)
checkErr(err)

doc.HTMLNode = htmlNode
}

func DocumentsFromDir(path string, ignorePatterns []interface{}) []Document {
// Nice proxy for recurseDir
return recurseDir(path, ignorePatterns, "")
doc.htmlNode = htmlNode
doc.parseNode(htmlNode)
doc.htmlMutex.Unlock() // MUTEX
}

func recurseDir(basePath string, ignorePatterns []interface{}, dPath string) []Document {
// Recursive function that returns all Document struts in a given
// os directory.
// basePath: the directory to scan
// dPath: the subdirectory within basePath we're scanning
// ignorePatterns: string slice of dPaths to ignore

documents := make([]Document, 0)

if isDirIgnored(ignorePatterns, dPath) {
return documents
}

// Open directory to scan
f, err := os.Open(path.Join(basePath, dPath))
checkErr(err)
defer f.Close()

// Get FileInfo of directory (scan it)
fi, err := f.Stat()
checkErr(err)

if fi.IsDir() { // Double check we're dealing with a directory
// Read all FileInfo-s from directory, Readdir(count int)
fis, err := f.Readdir(-1)
checkErr(err)

// Iterate over contents of directory
for _, fileinfo := range fis {
fPath := path.Join(dPath, fileinfo.Name())
if fileinfo.IsDir() {
// If item is a dir, we need to iterate further, save returned documents
documents = append(documents, recurseDir(basePath, ignorePatterns, fPath)...)
} else if path.Ext(fileinfo.Name()) == ".html" || path.Ext(fileinfo.Name()) == ".htm" {
// If a file, save to filename list
documents = append(documents, Document{
FilePath: path.Join(basePath, fPath),
SitePath: fPath,
Directory: dPath,
})
}
func (doc *Document) parseNode(n *html.Node) {
if n.Type == html.ElementNode {
// If present save fragment identifier to the hashMap
nodeId := GetId(n.Attr)
if nodeId != "" {
doc.hashMap[nodeId] = n
}
// Identify and store tags of interest
switch n.Data {
case "a", "link", "img", "script":
doc.NodesOfInterest = append(doc.NodesOfInterest, n)
case "pre", "code":
return // Everything within these elements is not to be interpreted
}
} else { // It's a file, return single file
filePath := path.Join(basePath, dPath)
documents = append(documents, Document{
FilePath: filePath,
SitePath: path.Base(filePath),
Directory: dPath,
})
}

return documents
// Iterate over children
for c := n.FirstChild; c != nil; c = c.NextSibling {
doc.parseNode(c)
}
}

func isDirIgnored(ignorePatterns []interface{}, dir string) bool {
for _, item := range ignorePatterns {
if ok, _ := regexp.MatchString(item.(string), dir+"/"); ok {
return true
}
}
return false
// IsHashValid reports whether hash is a key in the document's fragment
// identifier map. Parse is called first so the map has been populated.
func (doc *Document) IsHashValid(hash string) bool {
	// Parse returns immediately when the document has already been parsed.
	doc.Parse()
	_, present := doc.hashMap[hash]
	return present
}
117 changes: 117 additions & 0 deletions htmldoc/document_store.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
package htmldoc

import (
"os"
"path"
"regexp"
)

// DocumentStore holds the documents discovered under BasePath and indexes
// them by site path so that internal references can be resolved to documents.
type DocumentStore struct {
BasePath string // Path, relative to cwd, the site is located in
IgnorePatterns []interface{} // Regexes of directories to ignore; each item is type-asserted to string when matched
Documents []*Document // All of the documents, used to iterate over
DocumentPathMap map[string]*Document // Maps slash separated paths to documents, keyed by Document.SitePath
DocumentExtension string // File extension to look for; compared against path.Ext, so it includes the leading dot
DirectoryIndex string // What file is the index of the directory
}

// NewDocumentStore returns a DocumentStore whose Documents slice and
// DocumentPathMap are initialised and empty. All other fields are left at
// their zero values for the caller to fill in.
func NewDocumentStore() DocumentStore {
	var store DocumentStore
	store.Documents = []*Document{}
	store.DocumentPathMap = map[string]*Document{}
	return store
}

// AddDocument registers doc with the store: it is appended to Documents and
// indexed in DocumentPathMap under its SitePath. A document added later with
// the same SitePath replaces the earlier map entry.
func (dS *DocumentStore) AddDocument(doc *Document) {
	dS.Documents = append(dS.Documents, doc)

	sitePath := doc.SitePath
	dS.DocumentPathMap[sitePath] = doc
}

// Discover finds all documents in BasePath by starting the recursive scan at
// the base directory itself.
func (dS *DocumentStore) Discover() {
	const siteRoot = "." // BasePath expressed as a path relative to BasePath
	dS.discoverRecurse(siteRoot)
}

// isDirIgnored reports whether dir, with a trailing slash appended, matches
// any of the regular expressions in IgnorePatterns.
//
// Every pattern is type-asserted to string, so a non-string item panics.
// A pattern that fails to compile is treated as a non-match because the
// error from regexp.MatchString is discarded.
func (dS *DocumentStore) isDirIgnored(dir string) bool {
	candidate := dir + "/"
	for i := range dS.IgnorePatterns {
		pattern := dS.IgnorePatterns[i].(string)
		matched, _ := regexp.MatchString(pattern, candidate)
		if matched {
			return true
		}
	}
	return false
}

// discoverRecurse scans the directory dPath (relative to BasePath) and adds
// every file whose extension equals DocumentExtension to the store,
// descending into subdirectories. Directories matched by isDirIgnored are
// skipped entirely.
//
// Errors from opening, stat-ing or reading the directory are handed to
// checkErr. It panics if dPath turns out not to be a directory.
func (dS *DocumentStore) discoverRecurse(dPath string) {
	if dS.isDirIgnored(dPath) {
		return
	}

	// Open the directory to scan; it stays open until this call returns.
	dir, err := os.Open(path.Join(dS.BasePath, dPath))
	checkErr(err)
	defer dir.Close()

	info, err := dir.Stat()
	checkErr(err)

	// Double check we're dealing with a directory.
	if !info.IsDir() {
		panic("discoverRecurse encountered a file: " + dPath)
	}

	// A count of -1 reads all FileInfo-s from the directory.
	entries, err := dir.Readdir(-1)
	checkErr(err)

	for _, entry := range entries {
		name := entry.Name()
		entryPath := path.Join(dPath, name)

		switch {
		case entry.IsDir():
			// Subdirectory: delve deeper.
			dS.discoverRecurse(entryPath)
		case path.Ext(name) == dS.DocumentExtension:
			// Matching file: create the document and save it.
			doc := &Document{
				FilePath:  path.Join(dS.BasePath, entryPath),
				SitePath:  entryPath,
				Directory: dPath,
			}
			doc.Init()
			dS.AddDocument(doc)
		}
	}
}

// ResolvePath resolves an internal site path to a document in the store.
//
// Lookup order:
//  1. "/" resolves to the root DirectoryIndex document.
//  2. The path, with one leading slash removed, is looked up as-is.
//  3. The path is treated as a directory and its DirectoryIndex is looked up.
//
// It returns the document and true on success, or nil and false when nothing
// matches. An empty refPath never matches.
func (dS *DocumentStore) ResolvePath(refPath string) (*Document, bool) {
	// An empty path names nothing. Guarding here avoids indexing into an
	// empty string below, and stops path.Join("", DirectoryIndex) from
	// silently resolving to the root index.
	if refPath == "" {
		return nil, false
	}

	// Match root document
	if refPath == "/" {
		doc, ok := dS.DocumentPathMap[dS.DirectoryIndex]
		return doc, ok
	}

	// Is an absolute link, remove the leading slash for map lookup; map keys
	// are site paths without a leading slash.
	if refPath[0] == '/' {
		refPath = refPath[1:]
	}

	// Try path as-is, path.ext
	if doc, ok := dS.DocumentPathMap[refPath]; ok {
		return doc, true
	}

	// Try as a directory, path.ext/index.html
	doc, ok := dS.DocumentPathMap[path.Join(refPath, dS.DirectoryIndex)]
	return doc, ok
}

// ResolveRef resolves a reference to a document by passing the reference's
// site path (ref.RefSitePath()) to ResolvePath. The results are those of
// ResolvePath.
func (dS *DocumentStore) ResolveRef(ref *Reference) (*Document, bool) {
	sitePath := ref.RefSitePath()
	doc, found := dS.ResolvePath(sitePath)
	return doc, found
}
Loading

0 comments on commit 22d855b

Please sign in to comment.