From 5fa05e53f6d8c73d282cc3182927c8a8c8dee8b4 Mon Sep 17 00:00:00 2001 From: Gleb Mazovetskiy Date: Thu, 17 May 2018 22:22:23 +0900 Subject: [PATCH] Optimize header parsing Fixes #505 Benchmarks: "setext": ```bash ruby -rbenchmark -Ilib -rkramdown -e 'p Benchmark.measure{Kramdown::Document.new("1#{" "*20000}2\n==\n")}' ``` "atx": ```bash ruby -rbenchmark -Ilib -rkramdown -e 'p Benchmark.measure{Kramdown::Document.new("## 1#{" "*20000}2")}' ``` --- .gitignore | 5 +++ lib/kramdown/parser/gfm.rb | 19 ++++++--- lib/kramdown/parser/kramdown/header.rb | 39 +++++++++++++------ test/testcases/block/04_header/atx_header.hcd | 0 4 files changed, 47 insertions(+), 16 deletions(-) create mode 100644 test/testcases/block/04_header/atx_header.hcd diff --git a/.gitignore b/.gitignore index a2990bfb..7e3ec420 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ htmldoc pkg webgen-tmp +.bundle/ +CONTRIBUTERS +VERSION +kramdown.gemspec +man/man1/kramdown.1 diff --git a/lib/kramdown/parser/gfm.rb b/lib/kramdown/parser/gfm.rb index 3ae5396a..756179d1 100644 --- a/lib/kramdown/parser/gfm.rb +++ b/lib/kramdown/parser/gfm.rb @@ -112,19 +112,28 @@ def generate_gfm_header_id(text) @options[:auto_id_prefix] + result end - ATX_HEADER_START = /^\#{1,6}\s/ + ATX_HEADER_START = /^(?<level>\#{1,6})[\t ]+(?<contents>.*)\n/ define_parser(:atx_header_gfm, ATX_HEADER_START, nil, 'parse_atx_header') define_parser(:atx_header_gfm_quirk, ATX_HEADER_START) # Copied from kramdown/parser/kramdown/header.rb, removed the first line def parse_atx_header_gfm_quirk - start_line_number = @src.current_line_number - @src.check(ATX_HEADER_MATCH) - level, text, id = @src[1], @src[2].to_s.strip, @src[3] + text = @src["contents"] + text.rstrip! + id_match = HEADER_ID.match(text) + if id_match + id = id_match["id"] + text = text[0...-id_match[0].length] + text.rstrip! + end + text.sub!(/[\t ]#+\z/, '') + text.rstrip! return false if text.empty? 
+ level = @src["level"].length + start_line_number = @src.current_line_number @src.pos += @src.matched_size - el = new_block_el(:header, nil, nil, :level => level.length, :raw_text => text, :location => start_line_number) + el = new_block_el(:header, nil, nil, :level => level, :raw_text => text, :location => start_line_number) add_text(text, el) el.attr['id'] = id if id @tree.children << el diff --git a/lib/kramdown/parser/kramdown/header.rb b/lib/kramdown/parser/kramdown/header.rb index aa882dd9..fc2af227 100644 --- a/lib/kramdown/parser/kramdown/header.rb +++ b/lib/kramdown/parser/kramdown/header.rb @@ -13,18 +13,27 @@ module Kramdown module Parser class Kramdown - HEADER_ID=/(?:[ \t]+\{#([A-Za-z][\w:-]*)\})?/ - SETEXT_HEADER_START = /^(#{OPT_SPACE}[^ \t].*?)#{HEADER_ID}[ \t]*?\n(-|=)+\s*?\n/ + HEADER_ID = /[\t ]{#(?<id>[A-Za-z][\w:-]*)}\z/ + SETEXT_HEADER_START = /^#{OPT_SPACE}(?<contents>.*)\n(?<level>[-=])[-=]*[ \t\r\f\v]*\n/ # Parse the Setext header at the current location. def parse_setext_header return false if !after_block_boundary? + text = @src["contents"] + text.strip! + id_match = HEADER_ID.match(text) + if id_match + id = id_match["id"] + text = text[0...-id_match[0].length] + text.rstrip! + end + return false if text.empty? + level = @src["level"] == '-' ? 2 : 1 + start_line_number = @src.current_line_number @src.pos += @src.matched_size - text, id, level = @src[1], @src[2], @src[3] - text.strip! - el = new_block_el(:header, nil, nil, :level => (level == '-' ? 
2 : 1), :raw_text => text, :location => start_line_number) + el = new_block_el(:header, nil, nil, :level => level, :raw_text => text, :location => start_line_number) add_text(text, el) el.attr['id'] = id if id @tree.children << el @@ -33,20 +42,28 @@ def parse_setext_header define_parser(:setext_header, SETEXT_HEADER_START) - ATX_HEADER_START = /^\#{1,6}/ - ATX_HEADER_MATCH = /^(\#{1,6})(.+?(?:\\#)?)\s*?#*#{HEADER_ID}\s*?\n/ + ATX_HEADER_START = /^(?<level>\#{1,6})[\t ]*(?<contents>.*)\n/ # Parse the Atx header at the current location. def parse_atx_header return false if !after_block_boundary? - start_line_number = @src.current_line_number - @src.check(ATX_HEADER_MATCH) - level, text, id = @src[1], @src[2].to_s.strip, @src[3] + text = @src["contents"] + text.rstrip! + id_match = HEADER_ID.match(text) + if id_match + id = id_match["id"] + text = text[0...-id_match[0].length] + text.rstrip! + end + text.sub!(/[\t ]#+\z/, '') + text.rstrip! return false if text.empty? + level = @src["level"].length + start_line_number = @src.current_line_number @src.pos += @src.matched_size - el = new_block_el(:header, nil, nil, :level => level.length, :raw_text => text, :location => start_line_number) + el = new_block_el(:header, nil, nil, :level => level, :raw_text => text, :location => start_line_number) add_text(text, el) el.attr['id'] = id if id @tree.children << el diff --git a/test/testcases/block/04_header/atx_header.hcd b/test/testcases/block/04_header/atx_header.hcd new file mode 100644 index 00000000..e69de29b