Given an HTML text string, produces a tree representation of the document.
This module is a complete HTML parser, supporting HTML 3.2 to 4.1 and XHTML 1.0.
〈Overview〉 ≡
load-html: func [
"Load HTML text into a tree"
html [string!]
/with niwashi-rules [block!] "Use transformation rules for the niwashi"
/local 〈load-html's locals〉
] [
〈Parse html and generate a tree〉
]
form-html: func [
{Forms a HTML tree (eg. from LOAD-HTML) into HTML text}
html [block!]
/with options "Specify format options"
] [
〈Form html into HTML text〉
]
〈Options for form-html〉 ≡
pretty?: false ; pretty print output (indent etc.)
utf8?: false ; output UTF-8 instead of ASCII
〈Parse html and generate a tree〉 ≡
niwashi: make-niwashi
define-rules niwashi html-rules
if niwashi-rules [
define-rules niwashi niwashi-rules
]
parse-ml html func [cmd data] [
switch cmd [
text whitespace comment declaration xml-proc [
append-child niwashi [type: cmd properties: [value: data]]
]
; simple cases
<html> <head> <title> <script> <style> <object> [
enter-child niwashi [type: tag-to-word cmd properties: data]
]
<body> <legend> <caption> <fieldset> <noscript> <ins> <del> <iframe> <tt> <i> <b> <u>
<strike> <s> <big> <small> <sub> <sup> <em> <strong> <dfn> <code> <samp> <kbd> <var>
<cite> <font> <select> <textarea> <button> <optgroup> <label> <span> <abbr>
<acronym> <q> <applet> <video> [
attempt [
split-branch niwashi 'head
leave-child niwashi
]
enter-child niwashi [type: tag-to-word cmd properties: data]
]
<isindex> <isindex/> <base> <base/> <script/> <meta> <meta/> <link> <link/> <param> <param/> [
append-child niwashi [type: tag-to-word cmd properties: data]
]
<col> <col/> <br> <br/> <basefont> <basefont/> <area> <area/> <input> <input/> <source> <source/> [
attempt [
split-branch niwashi 'head
leave-child niwashi
]
append-child niwashi [type: tag-to-word cmd properties: data]
]
; block level
<h1> <h2> <h3> <h4> <h5> <h6> <address> <p> <ul> <ol> <dl> <pre> <dt> <dd>
<div> <center> <blockquote> [
open-tag tag-to-word cmd data [h1 h2 h3 h4 h5 h6 address p dt dd] [table]
]
<table> [
; allow table inside p because people are stupid
open-tag 'table data [h1 h2 h3 h4 h5 h6 address dt dd] [ ]
]
<li> [
open-tag 'li data [li h1 h2 h3 h4 h5 h6 address p dt dd] [ul ol]
]
<form> [
open-tag 'form data 'form [ ]
]
<tr> [
open-tag 'tr data [tr td th colgroup] 'table
]
<td> <th> [
open-tag tag-to-word cmd data [td th] 'table
]
<thead> <tfoot> <tbody> [
open-tag tag-to-word cmd data [thead tfoot tbody tr td th colgroup] 'table
]
<colgroup> [
open-tag 'colgroup data 'colgroup 'table
]
<hr> <hr/> [
attach?: no
unless attempt [
split-branch niwashi 'head
leave-child niwashi
true
] [
attempt [
split-branch/knots niwashi [h1 h2 h3 h4 h5 h6 address p dt] 'table
leave-child niwashi
attach?: yes
]
]
append-child niwashi [type: 'hr properties: data]
; ???
if attach? [attach-branch niwashi]
]
; (non simple) inline level
<a> <map> <option> [
open-tag cmd: tag-to-word cmd data cmd [ ]
]
</tt> </i> </b> </u> </strike> </s> </big> </small> </sub> </sup>
</em> </strong> </dfn> </code> </samp> </kbd> </var> </cite>
</a> </font> </map> </label> </span> </abbr> </acronym> </q> [
attempt [
split-branch niwashi tag-to-word cmd
leave-child niwashi
attach-branch niwashi
]
]
; rebol.com uses the spelling <image>... and FF accepts it!
<img> <img/> <image> <image/> [
attempt [
split-branch niwashi 'head
leave-child niwashi
]
append-child niwashi [type: 'img properties: data]
]
; closing tags
</head> </title> </script> </style> </object> </legend> </caption>
</fieldset> </noscript> </ins> </del> </iframe> </video>
; attach-branch?
</h1> </h2> </h3> </h4> </h5> </h6> </address> </ul> </ol> </li> </dl> </dt>
</dd> </pre> </div> </center> </blockquote> </form> </table>
</tr> </td> </th> </colgroup> </thead> </tfoot> </tbody>
</select> </textarea> </button> </option> </optgroup> [
attempt [
split-branch niwashi tag-to-word cmd
leave-child niwashi
]
]
</p> [
attempt [
split-branch/knots niwashi 'p 'table
leave-child niwashi
]
]
]
]
leave-all niwashi
result: niwashi/root
either html-node: get-node result/childs/html [
unless body-node: get-node html-node/childs/body [
body-node: make-node 'body
set-node body-node/parent: html-node
]
unless head-node: get-node html-node/childs/head [
head-node: make-tree [head [ ] [title [ ]]]
set-node body-node/previous: head-node
]
unless get-node head-node/childs/title [
either empty? get-node head-node/childs [
title-node: make-node 'title
set-node title-node/parent: head-node
] [
set-node head-node/childs/1/previous: make-node 'title
]
]
] [
enter-child niwashi [type: 'html]
enter-child niwashi [type: 'head]
append-child niwashi [type: 'title]
leave-child niwashi
append-child niwashi [type: 'body]
leave-child niwashi
]
result
〈load-html's locals〉 ≡
attach? result html-node head-node body-node title-node
〈Form html into HTML text〉 ≡
options: make default-fh-options any [options [ ]]
emit-childs copy "" html copy "" pick [html-utf8 html-ascii] to logic! options/utf8? to logic! options/pretty?
Inline level nodes:
text tt i b u s strike big small em strong dfn code samp kbd var cite abbr acronym
a img applet object font basefont br script map q sub sup span bdo iframe
input select textarea label button
Block level nodes:
p h1 h2 h3 h4 h5 h6 ul ol dir menu pre dl div center noscript noframes blockquote form
isindex hr table fieldset address
Flow means both block and inline.
〈Overview〉 +≡
inside-flow: [
on whitespace add-space
on text merge-text
ignore [td th caption tr thead tbody tfoot col colgroup]
after [
p h1 h2 h3 h4 h5 h6 ul ol dir menu pre dl div center noscript noframes
blockquote form isindex hr table fieldset address
] outside-flow
]
outside-flow: [
ignore [td th caption tr thead tbody tfoot col colgroup whitespace]
after [
text tt i b u s strike big small em strong dfn code samp kbd var cite abbr acronym
a font q sub sup span bdo
] inside-flow
]
html-rules: [
except [html comment declaration xml-proc] force html
ignore whitespace
inside html [
except [comment head body] force body
ignore [whitespace html declaration xml-proc]
on [title isindex base script style meta link object] force head
inside head [
ignore [whitespace head html declaration xml-proc]
inside title [
on text merge-text
after text [
on text merge-text
on whitespace add-space
ignore [declaration xml-proc html head title script style object]
]
ignore [whitespace declaration xml-proc html head title script style object]
]
inside object [
only param
]
]
inside body outside-flow
inside body [
always [
ignore [declaration xml-proc html head body]
on legend force fieldset
on style move-to-head
;on [title base meta link style] move to head
inside [pre textarea] [
always [
; need to add more here
ignore [td th caption tr thead tbody tfoot col colgroup]
on [text whitespace] preserve-whitespace
on br add-newline
]
]
inside all but [table thead tbody tfoot tr td th caption colgroup select] [
ignore [td th caption tr thead tbody tfoot col colgroup option optgroup]
]
inside all but video [
ignore source
]
inside table [
only [thead tfoot tbody tr td caption th col colgroup]
on tr force tbody
on [td th] force tr
inside caption inside-flow
inside [thead tfoot tbody] [
only [tr td th]
on [td th] force tr
inside tr [
only [td th]
inside [td th] inside-flow
]
]
inside colgroup [
only col
]
]
inside select [
only [option optgroup]
]
inside optgroup [
only option
]
inside [
blockquote center dd del div dl fieldset form ins legend li noscript ol
ul
] outside-flow
inside [
h1 h2 h3 h4 h5 h6 p address dt caption td th
tt i b u s strike big small em strong dfn code samp kbd var cite abbr acronym
a applet object font map q sub sup span bdo iframe video
option textarea label button
] inside-flow
]
]
]
]
!set-node-value-quick: macro [node val] [(:poke) node 3 (:reduce) ['value val]]
merge-text: func [node /local prev] expand-macros [
if all [prev: !get-node-previous node 'text = !get-node-type prev] [
insert tail !get-node-property prev 'value !get-node-property node 'value
!remove-node-quick node
]
]
add-space: func [node /local prev text] expand-macros [
either all [prev: !get-node-previous node 'text = !get-node-type prev] [
text: !get-node-property prev 'value
unless #" " = last text [insert tail text #" "]
!remove-node-quick node
] [
!set-node-type node 'text
!set-node-value-quick node (copy " ")
]
]
preserve-whitespace: func [node] expand-macros [
!set-node-type node 'text
merge-text node
]
add-newline: func [node] expand-macros [
!set-node-properties node (copy/deep [value "^/"])
preserve-whitespace node
]
move-to-head: func [node /local head-node root] [
root: niwashi/root
unless head-node: get-node root/childs/html/childs/head [
head-node: make-node 'head
set-node root/childs/html/childs/body/previous: head-node
]
set-node node/parent: head-node
]
〈Overview〉 +≡
default-fh-options: context [
〈Options for form-html〉
]
tag-to-word: func [tag] compose [
(:to) (word!) (:lowercase) trim/with (:to) (string!) tag #"/"
]
open-tag: func [type prop split knots /local attach?] [
unless attempt [
split-branch niwashi 'head
leave-child niwashi
true
] [
attach?: attempt [
split-branch/knots niwashi split knots
leave-child niwashi
true
]
]
enter-child niwashi [type: type properties: prop]
if attach? [attach-branch niwashi]
]
!emit: macro [value] [(:insert) (:tail) output value]
!indent: macro [] [(:head) (:insert) (:tail) (:copy) indent " "]
!emit-attributes: macro [attributes encoding] [
(:foreach) [attrname attrvalue] attributes [
(:if) :attrvalue [
(:either) (:word?) attrname [
(:insert) (:insert) (:insert) (:tail) output #" " attrname {="}
encode-text/to :attrvalue encoding output
(:insert) (:tail) output #"^""
] [
(:insert) (:insert) (:insert) (:insert) (:insert) (:tail) output
#" " (:pick) attrname 1 #":" (:pick) attrname 2 {="}
encode-text/to :attrvalue encoding output
(:insert) (:tail) output #"^""
]
]
]
]
!open-tag: macro [name attributes encoding] expand-macros [
(:insert) (:insert) (:tail) output #"<" name
!emit-attributes attributes encoding
(:insert) (:tail) output #">"
]
!empty-tag: macro [name attributes encoding] expand-macros [
(:insert) (:insert) (:tail) output #"<" name
!emit-attributes attributes encoding
(:insert) (:tail) output " />"
]
!close-tag: macro [name] [
(:insert) (:insert) (:insert) (:tail) output "</" name #">"
]
!emit-cdata: macro [text] [
(:insert) (:insert) (:insert) (:tail) output
"^//* <![CDATA[ */^/"
text
"^//* ]]> */^/"
]
!get-inside-text: macro [node] expand-macros [
(:either) (:empty?) !get-node-childs node [""] [
(:select) (:third) (:fourth) node 'value
]
]
emit-childs: func [output node indent encoding pretty? /local type] expand-macros [
foreach child !get-node-childs node [
switch/default type: !get-node-type child [
text whitespace [
encode-text/to !get-node-property child 'value encoding output
]
xml-proc declaration [
!emit (!get-node-property child 'value)
if pretty? [
insert !emit #"^/" indent
]
]
comment [
!emit (!get-node-property child 'value)
]
style [
!open-tag 'style (!get-node-properties child) encoding
!emit-cdata (!get-inside-text child)
if pretty? [!emit indent]
!close-tag 'style
if pretty? [insert !emit #"^/" indent]
]
] [
either find [
base link hr area input img br col isindex script meta
basefont source
] type [
!empty-tag type (!get-node-properties child) encoding
] [
!open-tag type (!get-node-properties child) encoding
if all [
pretty?
find [
html head script style object body noscript
ul ol dl div center blockquote
table form tr thead tfoot tbody
] type
] [
insert !emit "^/ " indent
]
emit-childs output child !indent encoding pretty?
!close-tag type
]
if all [
pretty?
find [
html head title script style object body legend caption
fieldset noscript iframe video isindex base meta link br basefont
h1 h2 h3 h4 h5 h6 address p ul ol dl pre dt dd div center blockquote
table li form tr td th thead tfoot tbody hr source
] type
] [
insert !emit #"^/" indent
]
]
]
output
]