Doc server broken link detection
The integration test, when run with -a, now detects and reports broken links and orphaned pages.

Broken links either point to an invalid page (404) or to an invalid anchor. Orphaned pages are pages that cannot be reached from the home pages by following links alone.
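
For intuition, orphan detection reduces to a reachability walk over the link graph. A minimal sketch of the idea (hypothetical names, not the LinkErrorDetector API, which also renders each page and verifies anchors):

from collections import deque

def find_orphans(links_from, all_pages, home_pages):
  # links_from: dict mapping each page to the pages it links to.
  # Any page not reachable from |home_pages| by following links is orphaned.
  seen = set(home_pages)
  queue = deque(home_pages)
  while queue:
    page = queue.popleft()
    for target in links_from.get(page, ()):
      if target in all_pages and target not in seen:
        seen.add(target)
        queue.append(target)
  return set(all_pages) - seen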

BUG=147747

Review URL: https://chromiumcodereview.appspot.com/17816005

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@212240 0039d316-1c4b-4281-b951-d872f2087c98
jaredshumway94@gmail.com committed Jul 18, 2013
1 parent 6cc885e commit 715ce5e
Showing 11 changed files with 387 additions and 18 deletions.
chrome/common/extensions/docs/server2/app.yaml (2 changes: 1 addition & 1 deletion)
@@ -1,5 +1,5 @@
application: chrome-apps-doc
version: 2-8-3
version: 2-9-3
runtime: python27
api_version: 1
threadsafe: false
chrome/common/extensions/docs/server2/cron.yaml (8 changes: 4 additions & 4 deletions)
@@ -2,19 +2,19 @@ cron:
- description: Load everything for trunk.
  url: /_cron/trunk
  schedule: every 5 minutes
  target: 2-8-3
  target: 2-9-3

- description: Load everything for dev.
  url: /_cron/dev
  schedule: every 5 minutes
  target: 2-8-3
  target: 2-9-3

- description: Load everything for beta.
  url: /_cron/beta
  schedule: every 5 minutes
  target: 2-8-3
  target: 2-9-3

- description: Load everything for stable.
  url: /_cron/stable
  schedule: every 5 minutes
  target: 2-8-3
  target: 2-9-3
chrome/common/extensions/docs/server2/cron_servlet.py (13 changes: 4 additions & 9 deletions)
@@ -10,18 +10,16 @@
from appengine_wrappers import (
    GetAppVersion, DeadlineExceededError, IsDevServer, logservice)
from branch_utility import BranchUtility
from caching_file_system import CachingFileSystem
from compiled_file_system import CompiledFileSystem
from empty_dir_file_system import EmptyDirFileSystem
from file_system_util import CreateURLsFromPaths
from github_file_system import GithubFileSystem
from host_file_system_creator import HostFileSystemCreator
from object_store_creator import ObjectStoreCreator
from render_servlet import RenderServlet
from server_instance import ServerInstance
from servlet import Servlet, Request, Response
from subversion_file_system import SubversionFileSystem
import svn_constants
from third_party.json_schema_compiler.memoize import memoize

class _SingletonRenderServletDelegate(RenderServlet.Delegate):
  def __init__(self, server_instance):
@@ -87,16 +85,13 @@ def get_via_render_servlet(path):
def run_cron_for_dir(d, path_prefix=''):
  success = True
  start_time = time.time()
  # TODO(jshumway): use server_instance.host_file_system.Walk.
  # TODO(kalman): delete me where it's set.
  files = [f for f in server_instance.content_cache.GetFromFileListing(d)
           if not f.endswith('/') and f != 'redirects.json']
  files = dict(
      CreateURLsFromPaths(server_instance.host_file_system, d, path_prefix))
  logging.info('cron/%s: rendering %s files from %s...' % (
      channel, len(files), d))
  try:
    for i, f in enumerate(files):
    for i, path in enumerate(files):
      error = None
      path = '%s%s' % (path_prefix, f)
      try:
        response = get_via_render_servlet(path)
        if response.status != 200:
chrome/common/extensions/docs/server2/file_system_util.py (14 changes: 14 additions & 0 deletions)
@@ -0,0 +1,14 @@
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import posixpath

def CreateURLsFromPaths(file_system, directory, urlprefix):
  '''Yields a (url, path) pair for every file in |directory|, where the URL
  is the file's path relative to |directory|, prefixed by |urlprefix|.
  '''
  for root, _, files in file_system.Walk(directory):
    for f in files:
      url = posixpath.join(urlprefix, root[len(directory) + 1:], f)
      yield url, '%s/%s' % (root, f)
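
For context, the cron servlet above consumes this generator as dict(CreateURLsFromPaths(...)). A small illustration against a stub file system (the stub and its contents are made up; Walk is assumed to behave like os.walk):

class StubFileSystem(object):
  def Walk(self, directory):
    # Mimics os.walk: yields (root, dirs, files) tuples.
    yield directory, ['subdir'], ['index.html']
    yield directory + '/subdir', [], ['page.html']

for url, path in CreateURLsFromPaths(StubFileSystem(), 'templates/public', 'docs'):
  print('%s <- %s' % (url, path))
# docs/index.html <- templates/public/index.html
# docs/subdir/page.html <- templates/public/subdir/page.html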
chrome/common/extensions/docs/server2/handler.py (2 changes: 1 addition & 1 deletion)
@@ -3,8 +3,8 @@
# found in the LICENSE file.

from cron_servlet import CronServlet
from patch_servlet import PatchServlet
from instance_servlet import InstanceServlet
from patch_servlet import PatchServlet
from servlet import Servlet, Request, Response

_DEFAULT_SERVLET = InstanceServlet.GetConstructor()
chrome/common/extensions/docs/server2/integration_test.py (53 changes: 52 additions & 1 deletion)
@@ -8,13 +8,16 @@
import build_server
build_server.main()

import logging
from itertools import groupby
from operator import itemgetter
import optparse
import os
import sys
import time
import unittest

from link_error_detector import LinkErrorDetector
from local_file_system import LocalFileSystem
from local_renderer import LocalRenderer
from fake_fetchers import ConfigureFakeFetchers
from handler import Handler
@@ -40,6 +43,24 @@ def _GetPublicFiles():
        public_files['/'.join((relative_posix_path, filename))] = f.read()
  return public_files

def _PrintBrokenLinks(broken_links):
  '''Prints out broken links in a more readable format.
  '''
  col_width = max(len(link[0]) for link in broken_links)
  getter = itemgetter(1)

  def pretty_print(prefix, message):
    print("%s%s -> %s" % (prefix, (col_width - len(prefix)) * ' ', message))

  for target, links in groupby(sorted(broken_links, key=getter), getter):
    links = [l[0] for l in links]
    if len(links) > 50:
      out = "%s and %d others" % (links[0], len(links) - 1)
      pretty_print(out, target)
    else:
      for link in links:
        pretty_print(link, target)
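
With made-up data, the output groups each broken source page under the target it failed to resolve, padded to the longest source path:

_PrintBrokenLinks([
    ('/extensions/tabs.html', '/extensions/windws.html'),
    ('/apps/app.html', '/extensions/windws.html'),
])
# Prints:
# /extensions/tabs.html -> /extensions/windws.html
# /apps/app.html        -> /extensions/windws.html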

class IntegrationTest(unittest.TestCase):
  def setUp(self):
    ConfigureFakeFetchers()
@@ -61,6 +82,33 @@ def testCronAndPublicFiles(self):
    finally:
      print('Took %s seconds' % (time.time() - start_time))

print("Checking for broken links...")
start_time = time.time()
link_error_detector = LinkErrorDetector(
LocalFileSystem(os.path.join(sys.path[0], os.pardir, os.pardir)),
lambda path: Handler(Request.ForTest(path)).Get(),
'templates/public',
('extensions/index.html', 'apps/about_apps.html'))

broken_links, broken_anchors = link_error_detector.GetBrokenLinks()
if broken_links or broken_anchors:
# TODO(jshumway): Test should fail when broken links are detected.
print('Warning: Found %d broken links:' % (
len(broken_links + broken_anchors)))
_PrintBrokenLinks(broken_links + broken_anchors)

print('Took %s seconds.' % (time.time() - start_time))

print('Searching for orphaned pages...')
start_time = time.time()
orphaned_pages = link_error_detector.GetOrphanedPages()
if orphaned_pages:
# TODO(jshumway): Test should fail when orphaned pages are detected.
print('Warning: Found %d orphaned pages:' % len(orphaned_pages))
for page in orphaned_pages:
print(page)
print('Took %s seconds.' % (time.time() - start_time))

public_files = _GetPublicFiles()

print('Rendering %s public files...' % len(public_files.keys()))
@@ -103,6 +151,9 @@ def testExplicitFiles(self):
    finally:
      print('Took %s seconds' % (time.time() - start_time))

    # TODO(jshumway): Check page for broken links (currently prohibited by the
    # time it takes to render the pages).

  @DisableLogging('warning')
  def testFileNotFound(self):
    response = Handler(Request.ForTest('/extensions/notfound.html')).Get()