Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reading and writing of HTML (compatible) to a simple ADT format. #1680

Merged
merged 23 commits into from
Oct 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
091dd90
first version, not finished
jurgenvinju Sep 24, 2022
1c52ce1
Merge branch 'main' into html-ast
jurgenvinju Oct 17, 2022
06d65cf
HTML parser produces AST nodes now instead of node
jurgenvinju Oct 17, 2022
978406f
perfecting the HTML reader
jurgenvinju Oct 18, 2022
7c14e69
fixed static error in m3 Core
jurgenvinju Oct 18, 2022
717ea3b
added yielding of HTMLElement as XHTML output, both string and file
jurgenvinju Oct 18, 2022
05e941f
set copyright year
jurgenvinju Oct 18, 2022
6548259
fixed some more documentation
jurgenvinju Oct 18, 2022
17e2637
fixed last TODO: allowing multiple text nodes under a single element
jurgenvinju Oct 18, 2022
c057536
removed artificial top node
jurgenvinju Oct 18, 2022
88bcae5
getFirstChild added
jurgenvinju Oct 18, 2022
20f92f6
added commented-out experiment with validation
jurgenvinju Oct 18, 2022
3207b70
cleanup
jurgenvinju Oct 18, 2022
f9422c9
moved DOM to existing dependency jdom instead of org.w3c.dom because …
jurgenvinju Oct 20, 2022
c18b7a6
added example and test, but roundtrip test is broken due to additiona…
jurgenvinju Oct 20, 2022
802196e
switched from editorkit and jdom to jsoup after not being able to str…
jurgenvinju Oct 20, 2022
f15db79
added options to writeHTML{File,String}
jurgenvinju Oct 21, 2022
997a534
finalizing jsoup match
jurgenvinju Oct 21, 2022
086054a
unused import
jurgenvinju Oct 21, 2022
3bfe541
jdom is needed for xml support
jurgenvinju Oct 21, 2022
5e7d4b5
relocated example file
jurgenvinju Oct 21, 2022
1512797
added getCharacterWriter to URIResolverRegistry based on suggestion b…
jurgenvinju Oct 21, 2022
7620804
removed unused stream; thanks @davylandman
jurgenvinju Oct 21, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -391,10 +391,15 @@
<artifactId>snakeyaml</artifactId>
<version>1.31</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
</dependency>
<dependency>
<groupId>org.jdom</groupId>
<artifactId>jdom2</artifactId>
<version>2.0.6</version>
<version>2.0.6.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
Expand Down
337 changes: 337 additions & 0 deletions src/org/rascalmpl/library/lang/html/AST.rsc
Original file line number Diff line number Diff line change
@@ -0,0 +1,337 @@
@synopsis{Plain Algebraic Datatype for HTML}
module lang::html::AST

@synopsis{Abstract Syntax for HTML}
@description{
This is HTML encoded like so:
* <1> element tags are constructor names of type `HTMLElement`
* <2> all tags have a list of HTMLElement as children, except the `void` tags that do not have any parameters
* <3> text nodes and data nodes (which are invisible in HTML) have the \data or text constructor
* <4> attributes are keyword parameters of type `str`
* <5> unknown tags (such as SVG) are mapped to `unknownElement` nodes and their children are not included.
}
data HTMLElement // <1>
// Each alternative below uses an HTML tag name as its constructor name.
// Constructors taking `list[HTMLElement] elems` carry their child nodes in `elems`;
// constructors with an empty parameter list are the void tags (<2>).
// `\data` and `text` (<3>) carry a single `str` instead of a child list.
// `\data` and `\map` are backslash-escaped because `data` and `map` are Rascal keywords.
= a(list[HTMLElement] elems)
| abbr(list[HTMLElement] elems)
| address(list[HTMLElement] elems)
| area() // <2>
| article(list[HTMLElement] elems)
| aside(list[HTMLElement] elems)
| audio(list[HTMLElement] elems)
| b(list[HTMLElement] elems)
| base() // <2>
| bdi(list[HTMLElement] elems)
| bdo(list[HTMLElement] elems)
| blockquote(list[HTMLElement] elems)
| body(list[HTMLElement] elems)
| br() // <2>
| button(list[HTMLElement] elems)
| canvas(list[HTMLElement] elems)
| caption(list[HTMLElement] elems)
| cite(list[HTMLElement] elems)
| code(list[HTMLElement] elems)
| col() // <2>
| colgroup(list[HTMLElement] elems)
| command() // <2>
| \data(str dataContent) // <3>
| datalist(list[HTMLElement] elems)
| dd(list[HTMLElement] elems)
| del(list[HTMLElement] elems)
| details(list[HTMLElement] elems)
| dfn(list[HTMLElement] elems)
| dialog(list[HTMLElement] elems)
| div(list[HTMLElement] elems)
| dl(list[HTMLElement] elems)
| dt(list[HTMLElement] elems)
| em(list[HTMLElement] elems)
| embed() // <2>
| fieldset(list[HTMLElement] elems)
| figcaption(list[HTMLElement] elems)
| figure(list[HTMLElement] elems)
| footer(list[HTMLElement] elems)
| form(list[HTMLElement] elems)
| h1(list[HTMLElement] elems)
| h2(list[HTMLElement] elems)
| h3(list[HTMLElement] elems)
| h4(list[HTMLElement] elems)
| h5(list[HTMLElement] elems)
| h6(list[HTMLElement] elems)
| head(list[HTMLElement] elems)
| header(list[HTMLElement] elems)
| hgroup(list[HTMLElement] elems)
| hr() // <2>
| html(list[HTMLElement] elems)
| i(list[HTMLElement] elems)
| iframe(list[HTMLElement] elems)
| img() // <2>
| input() // <2>
| ins(list[HTMLElement] elems)
| kbd(list[HTMLElement] elems)
| keygen() // <2>
| label(list[HTMLElement] elems)
| legend(list[HTMLElement] elems)
| li(list[HTMLElement] elems)
| link() // <2>
| main(list[HTMLElement] elems)
| \map(list[HTMLElement] elems)
| mark(list[HTMLElement] elems)
| menu(list[HTMLElement] elems)
| menuitem(list[HTMLElement] elems)
| meta() // <2>
| meter(list[HTMLElement] elems)
| nav(list[HTMLElement] elems)
| noscript(list[HTMLElement] elems)
| object(list[HTMLElement] elems)
| ol(list[HTMLElement] elems)
| optgroup(list[HTMLElement] elems)
| option(list[HTMLElement] elems)
| output(list[HTMLElement] elems)
| p(list[HTMLElement] elems)
| param() // <2>
| pre(list[HTMLElement] elems)
| progress(list[HTMLElement] elems)
| q(list[HTMLElement] elems)
| rp(list[HTMLElement] elems)
| rt(list[HTMLElement] elems)
| ruby(list[HTMLElement] elems)
| s(list[HTMLElement] elems)
| samp(list[HTMLElement] elems)
| script(list[HTMLElement] elems)
| section(list[HTMLElement] elems)
| select(list[HTMLElement] elems)
| small(list[HTMLElement] elems)
| source() // <2>
| span(list[HTMLElement] elems)
| strong(list[HTMLElement] elems)
| style(list[HTMLElement] elems)
| sub(list[HTMLElement] elems)
| summary(list[HTMLElement] elems)
| sup(list[HTMLElement] elems)
| table(list[HTMLElement] elems)
| tbody(list[HTMLElement] elems)
| td(list[HTMLElement] elems)
| template(list[HTMLElement] elems)
| text(str contents) // <3>
| textarea(list[HTMLElement] elems)
| tfoot(list[HTMLElement] elems)
| th(list[HTMLElement] elems)
| thead(list[HTMLElement] elems)
| time(list[HTMLElement] elems)
| title(list[HTMLElement] elems)
| tr(list[HTMLElement] elems)
| track() // <2>
| u(list[HTMLElement] elems)
| ul(list[HTMLElement] elems)
// NOTE(review): item <5> of the description says the children of unknown tags are
// not included, yet this constructor declares an `elems` list — confirm which is intended.
| unknownElement(list[HTMLElement] elems) // <5>
| var(list[HTMLElement] elems)
| video(list[HTMLElement] elems)
| wbr() // <2>

;

data HTMLElement( // <4>
// Common keyword parameters shared by every HTMLElement constructor.
// Each one is of type `str` and defaults to the empty string "".
// Names written with a leading backslash (e.g. `\data`, `\for`, `\type`) are escaped
// because they collide with Rascal reserved words.
// NOTE(review): `aria`, `http`, `local_` and the `xml_*` names presumably stand in for
// hyphenated or namespaced HTML attributes (aria-*, http-equiv, xml:lang, ...), which are
// not valid Rascal identifiers — confirm the exact mapping against the reader/writer.
str abbr = "",
str about = "",
str accept = "",
str accesskey = "",
str action = "",
str align = "",
str allowfullscreen = "",
str alt = "",
str aria = "",
str async = "",
str autocomplete = "",
str autofocus = "",
str autoplay = "",
str border = "",
str challenge = "",
str char = "",
str charset = "",
str checked = "",
str cite = "",
str class = "",
str cols = "",
str colspan = "",
str command = "",
str content = "",
str contenteditable = "",
str contextmenu = "",
str controls = "",
str coords = "",
str \data = "",
str datatype = "",
str \datetime = "",
str \default = "",
str defer = "",
str dir = "",
str dirname = "",
str disabled = "",
str download = "",
str draggable = "",
str dropzone = "",
str enctype = "",
str \for = "",
str form = "",
str formaction = "",
str formenctype = "",
str formmethod = "",
str formnovalidate = "",
str formtarget = "",
str headers = "",
str height = "",
str hidden = "",
str high = "",
str href = "",
str hreflang = "",
str http = "",
str icon = "",
str id = "",
str inlist = "",
str ismap = "",
str itemid = "",
str itemprop = "",
str itemref = "",
str itemscope = "",
str itemtype = "",
str keytype = "",
str kind = "",
str label = "",
str lang = "",
str language = "",
str \list = "",
str local_ = "",
str loop = "",
str low = "",
str manifest = "",
str max = "",
str maxlength = "",
str media = "",
str mediagroup = "",
str method = "",
str min = "",
str multiple = "",
str muted = "",
str name = "",
str novalidate = "",
str onabort = "",
str onafterprint = "",
str onbeforeprint = "",
str onbeforeunload = "",
str onblur = "",
str oncanplay = "",
str oncanplaythrough = "",
str onchange = "",
str onclick = "",
str oncontextmenu = "",
str ondblclick = "",
str ondrag = "",
str ondragend = "",
str ondragenter = "",
str ondragleave = "",
str ondragover = "",
str ondragstart = "",
str ondrop = "",
str ondurationchange = "",
str onemptied = "",
str onended = "",
str onerror = "",
str onfocus = "",
str onformchange = "",
str onforminput = "",
str onhashchange = "",
str oninput = "",
str oninvalid = "",
str onkeydown = "",
str onkeypress = "",
str onkeyup = "",
str onload = "",
str onloadeddata = "",
str onloadedmetadata = "",
str onloadstart = "",
str onmessage = "",
str onmousedown = "",
str onmousemove = "",
str onmouseout = "",
str onmouseover = "",
str onmouseup = "",
str onmousewheel = "",
str onoffline = "",
str ononline = "",
str onpagehide = "",
str onpageshow = "",
str onpause = "",
str onplay = "",
str onplaying = "",
str onpopstate = "",
str onprogress = "",
str onratechange = "",
str onredo = "",
str onreset = "",
str onresize = "",
str onscroll = "",
str onseeked = "",
str onseeking = "",
str onselect = "",
str onshow = "",
str onstalled = "",
str onstorage = "",
str onsubmit = "",
str onsuspend = "",
str ontimeupdate = "",
str onundo = "",
str onunload = "",
str onvolumechange = "",
str onwaiting = "",
str open = "",
str optimum = "",
str pattern = "",
str ping = "",
str placeholder = "",
str poster = "",
str prefix = "",
str preload = "",
str property = "",
str radiogroup = "",
str readonly = "",
str \rel = "",
str required = "",
str resource = "",
str rev = "",
str reversed = "",
str role = "",
str rows = "",
str rowspan = "",
str sandbox = "",
str scope = "",
str scoped = "",
str seamless = "",
str selected = "",
str shape = "",
str size = "",
str sizes = "",
str span = "",
str spellcheck = "",
str src = "",
str srcdoc = "",
str srclang = "",
str \start = "",
str step = "",
str style = "",
str tabindex = "",
str target = "",
str template = "",
str title = "",
str translate = "",
str \type = "",
str typeof = "",
str usemap = "",
str valign = "",
str \value = "",
str vocab = "",
str width = "",
str wrap = "",
str xml_base = "",
str xml_id = "",
str xml_lang = "",
str xml_space = ""
);
Loading