Defines a function that can convert a HTML text string into a plain text string.
This module defines a function, html-to-text, that can convert a HTML string into just plain text. Currently the conversion is rather simple, but it is easy to improve it.
The source html string is assumed to use the UTF-8 encoding. If your text uses any other encoding, you must convert it to UTF-8 before calling this function.
The output string will always be UTF-8 encoded.
〈Overview〉 ≡
〈Conversion to text (stage 3)〉
html-to-text: func [
"Convert HTML to plain text"
html [string!]
] [
〈Parse the html text and generate plain text〉
]
We use a multi-stage pipeline of finite state machines (FSMs). The first stage uses parse-ml on the source text, parsing tags, text and character entities, and sends the parsed elements to the second stage, the html-normalizer module, via the process-tag function; that in turn sends the normalized elements to the stage3 function.
〈Parse the html text and generate plain text〉 ≡
init-stage3
init-normalizer :stage3
parse-ml html :process-tag
reset-normalizer
end-stage3
〈Conversion to text (stage 3)〉 ≡
stage3: func [command command-data] [
process-event stage3-fsm command command-data
]
init-stage3: does [
out: make string! 1024
uri-base: none
clear links
clear indent
reset-fsm/only stage3-fsm
]
end-stage3: does [
reset-fsm stage3-fsm
emit newline
forall links [
emit [#"[" index? links "] "]
emit-uri links/1
emit newline
]
out
]
links: [ ]
para-buffer: ""
stage3-fsm: make-fsm [
initial-state: [
<html> in-html
]
in-html: [
<head> in-head (unless empty? out [emit newline])
<body> in-body
</html> return
]
in-head: [
</head> return
<title> in-title
<base/> (uri-base: any [select data 'href uri-base])
<style> in-style
;<meta/>
;<link/> ( ; turns out to be not very useful in practice
; emit any [select data 'rel select data 'rev "LINK: "]
; emit-uri select data 'href
; emit newline
;)
]
in-style: [
</style> return
]
in-title: [
</title> return
]
in-body: [
</body> return
<style> in-style
<h1> (clear para-buffer) in-para (
uppercase para-buffer
emit-para
emit [
indent copy/part {======================================================================} 70 - length? indent
newline
indent newline
]
)
<h2> <h3> <h4> <h5> <h6> (clear para-buffer) in-para (
uppercase para-buffer
emit-para
emit [indent newline]
)
<address> <p> <legend> (clear para-buffer) in-para (emit-para emit [indent newline])
<pre> (clear para-buffer) in-pre (emit para-buffer)
<ul> (increase-indent) in-ulist (decrease-indent emit [indent newline])
<ol> (increase-indent count: any [if count*: select data 'start [attempt [to integer! count*]] 1]) in-olist (decrease-indent emit [indent newline])
<dl> (increase-indent) in-dlist (decrease-indent emit [indent newline])
;<center>
<blockquote> (increase-indent) in-blockquote (decrease-indent)
<table> (
emit [
indent copy/part {== TABLE =============================================================} 70 - length? indent
newline
]
) in-table (
emit [
indent copy/part {======================================================================} 70 - length? indent
newline
indent newline
]
)
<hr/> (emit [
indent copy/part {----------------------------------------------------------------------} 70 - length? indent
newline
indent newline
])
]
in-blockquote: inherit in-body [
</blockquote> return
]
in-para: [
</h1> </h2> </h3> </h4> </h5> </h6> </address> </p>
</dt> </pre> </caption> </legend>
return
;<tt> <i> <b> <u> <strike> <s> <big> <small> <sub> <sup>
;<em> <strong> <dfn> <code> <samp> <kbd> <var> <cite>
;<abbr> <acronym>
<a> (append links select data 'href) in-link (repend para-buffer [" [" length? links "] "])
<select> in-select
;<textarea> in-option
;<button>
;<label>
;<input/>
<q> (record #{E2809C})
</q> (record #{E2809D})
;<object>
<img/> (record "[IMAGE]") ; show src="" ?
<br/> (emit-para)
whitespace: (record " ")
text: (record data)
]
in-pre: inherit in-para [
text: whitespace: (record data)
<br/> (record newline)
]
in-link: inherit in-para [
</a> return
]
; we are supporting lists and tables only for very simple cases...
in-item: inherit in-para [
</li> </dt> </dd> </td> </th> return
]
in-ulist: [
<li> (clear para-buffer) in-item (emit-para/with " * ")
<ul> (increase-indent) in-ulist (decrease-indent)
<ol> (increase-indent) in-olist (decrease-indent)
</ul> </ol> return
]
in-olist: inherit in-ulist [
<li> (
clear para-buffer
if all [
count*: select data 'value
attempt [count*: to integer count*]
] [
count: count*
]
) in-item (emit-para/with rejoin [#" " count ". "] count: count + 1)
]
in-dlist: [
<dt> (clear para-buffer) in-item (record #":" emit-para)
<dd> (clear para-buffer) in-item (emit-para emit [indent newline])
</dl> return
]
in-table: [
<caption> (clear para-buffer emit [indent "Table caption:" newline]) in-para (emit-para emit [indent newline])
<thead> <tfoot> <tbody> (emit [
indent copy/part {----------------------------------------------------------------------} 70 - length? indent
newline
]) in-rows
<tr> continue in-rows
</table> return
]
in-rows: [
<tr> (
emit [indent "--" newline]
) in-cells
</thead> </tfoot> </tbody> return
</table> 2 return ; no <tbody>
]
in-cells: [
<td> <th> (append indent " || " emit [indent "||" newline]) in-cell (decrease-indent)
</tr> return
]
in-cell: inherit in-body [
</td> </th> return
text: (clear para-buffer) continue in-cell-para (emit-para)
]
in-cell-para: inherit in-para [
</td> </th> 2 return
]
in-select: [
</select> return
]
]
emit: func [value] [repend out value]
indent: ""
increase-indent: does [append indent " "]
decrease-indent: does [clear skip tail indent -6]
break-at: complement charset [#"0" - #"9" #"A" - #"Z" #"a" - #"z" {"'#$%&([^{@} #"" - #""]
emit-para: has [pos /with bullet] [
emit indent
if bullet [
change skip tail out negate length? bullet bullet
bullet: none
]
while [(length? para-buffer) > (70 - length? indent)] [
pos: skip para-buffer 71 - length? indent
; bug: /tail does not work with charset
pos: find/reverse pos break-at
if any [not pos (index? pos) <= index? para-buffer] [
pos: skip para-buffer 71 - length? indent
pos: any [find pos break-at tail pos]
]
pos: next pos
insert/part tail out para-buffer para-buffer: pos
emit newline
emit indent
]
emit para-buffer
emit newline
clear para-buffer: head para-buffer
]
record: func [text] [
append para-buffer text
]
emit-uri: func [uri] [
; needs to be smarter.
if uri-base [emit uri-base]
emit uri
]