Browse Source

Added html parser

typed
Weird Constructor 6 months ago
parent
commit
edc2f255fb
  1. 7
      Cargo.toml
  2. 221
      src/stdlib/html2vval.rs
  3. 2
      src/stdlib/imap.rs
  4. 4
      src/stdlib/mod.rs
  5. 508
      src/stdlib/rcdom.rs
  6. 26
      src/stdlib/util.rs

7
Cargo.toml

@ -19,10 +19,11 @@ default = [
mqtt = ["rumqttd", "rumqttc"]
http = ["reqwest", "rouille"]
mail = ["native-tls", "imap", "mailparse", "imap-proto"]
html = ["html5ever", "tendril", "markup5ever"]
cursive = ["dep:cursive", "cursive_buffered_backend", "unicode-width"]
odbc = ["odbc-api", "odbc-sys"]
clipboard = ["copypasta"]
all = ["mqtt", "http", "cursive", "zip", "odbc", "clipboard", "sqlite", "mail"]
all = ["mqtt", "http", "cursive", "zip", "odbc", "clipboard", "sqlite", "mail", "html"]
[dependencies]
fnv = "1.0.7"
@ -54,6 +55,10 @@ unicode-width = { version = "0.1", optional = true }
odbc-api = { version = "0.57.0", optional = true }
odbc-sys = { version = "0.21.3", optional = true }
copypasta = { version = "0.8.1", optional = true }
html5ever = { version = "0.26.0", optional = true }
tendril = { version = "0.4.3", optional = true }
markup5ever = { version = "0.11.0", optional = true }
#xml5ever = { version = "0.16" }
#cursive = { path = "../cursive/cursive", optional = true, features = ["crossterm-backend"], default-features = false }
#rumqttc = { path = "../other/rumqtt/rumqttc", optional = true }
#rumqttd = { path = "../other/rumqtt/rumqttd", optional = true }

221
src/stdlib/html2vval.rs

@ -0,0 +1,221 @@
use super::rcdom::*;
use crate::vval::*;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use tendril::StrTendril;
/// Converts a DOM node (and, recursively, its children) into a `VVal` map.
///
/// Keys written to the resulting map:
///
/// * `%type`  - symbol: `document`, `text`, `element` or `unknown`.
/// * `%data`  - text nodes only: the text contents.
/// * `%name`  - elements only: the local tag name as symbol.
/// * `%attrs` - elements only: vector of `(name, value)` pairs.
/// * `@<attr>` - elements only: one key per attribute holding its value.
/// * `<tag>`  - vector of all direct element children with that tag name.
/// * `%text`  - concatenated contents of the direct text children.
/// * `%rtext` - concatenated text of the whole subtree in document order.
/// * `%childs` - vector of all direct children, only if `prune_childs` is
///   `false` and there is at least one child.
///
/// `%text` and `%rtext` are only present if there was something that
/// contributed to them.
fn handle2vval(handle: &Handle, prune_childs: bool) -> VVal {
    let v_node = VVal::map();

    match &handle.data {
        NodeData::Document => {
            let _ = v_node.set_key_str("%type", VVal::new_sym("document")).unwrap();
        }
        NodeData::Text { contents } => {
            let _ = v_node.set_key_str("%type", VVal::new_sym("text")).unwrap();
            let _ = v_node
                .set_key_str("%data", VVal::new_str_mv(contents.borrow().to_string()))
                .unwrap();
        }
        NodeData::Element { name, attrs, .. } => {
            let _ = v_node.set_key_str("%type", VVal::new_sym("element")).unwrap();
            let _ = v_node.set_key_str("%name", VVal::new_sym(&name.local)).unwrap();

            let v_attrs = VVal::vec();
            for attr in attrs.borrow().iter() {
                let value = VVal::new_str_mv(attr.value.to_string());
                let _ = v_node.set_key_str(&(String::from("@") + &attr.name.local), value.clone());
                v_attrs.push(VVal::pair(VVal::new_sym(&attr.name.local), value));
            }
            let _ = v_node.set_key_str("%attrs", v_attrs);
        }
        _ => {
            let _ = v_node.set_key_str("%type", VVal::new_sym("unknown"));
        }
    }

    // The texts are accumulated in local buffers and stored once after the
    // loop. Re-reading, copying and re-storing the map entry for every child
    // would make the conversion quadratic in the amount of text.
    let mut text: Option<String> = None;
    let mut rtext: Option<String> = None;
    let mut childs = VVal::None;

    for child in handle.children.borrow().iter() {
        let v_child = handle2vval(child, prune_childs);
        let c_type = v_child.v_s_rawk("%type");

        if c_type == "text" {
            v_child.v_with_s_refk("%data", |s| {
                text.get_or_insert_with(String::new).push_str(s);
                rtext.get_or_insert_with(String::new).push_str(s);
            });
        } else if c_type == "element" {
            let name = v_child.v_k("%name");
            name.with_s_ref(|e_name| {
                if v_node.v_k(e_name).is_none() {
                    let _ = v_node.set_key_str(e_name, VVal::vec());
                }
                v_node.v_k(e_name).push(v_child.clone());
            });
        }

        // Propagate the recursive text of the child's subtree upwards.
        let child_rtext = v_child.v_k("%rtext");
        if child_rtext.is_some() {
            child_rtext.with_s_ref(|s| rtext.get_or_insert_with(String::new).push_str(s));
        }

        // Only collect the generic child list if it is going to be stored.
        if !prune_childs {
            if childs.is_none() {
                childs = VVal::vec();
            }
            childs.push(v_child);
        }
    }

    if let Some(text) = text {
        let _ = v_node.set_key_str("%text", VVal::new_str_mv(text));
    }
    if let Some(rtext) = rtext {
        let _ = v_node.set_key_str("%rtext", VVal::new_str_mv(rtext));
    }
    if childs.is_some() {
        let _ = v_node.set_key_str("%childs", childs);
    }

    v_node
}
/// Converts a DOM node into the "simplified" representation and attaches it
/// to `parent`.
///
/// Every element becomes a pair of `(tag name symbol, map)`. The map holds
/// the attributes as keys and the ordered child list under the key `"_"`.
/// Text nodes are pushed to the child list as plain strings. The document
/// node itself is not represented; its children are attached directly to
/// `parent`. All other node kinds are stored as an empty `unknown` pair.
///
/// `level` is the current recursion depth; it is only passed along.
fn handle2vval_simplified(handle: &Handle, parent: &VVal, level: u32) {
    // Appends a value to the child list ("_") of the parent pair.
    let attach = |item: VVal| {
        parent.v_(1).v_k("_").push(item);
    };

    let current = match &handle.data {
        NodeData::Text { contents } => {
            // Text nodes are leafs, nothing to recurse into.
            attach(VVal::new_str_mv(contents.borrow().to_string()));
            return;
        }
        NodeData::Document => parent.clone(),
        NodeData::Element { name, attrs, .. } => {
            let attr_map = VVal::map1("_", VVal::vec());
            for attr in attrs.borrow().iter() {
                let _ =
                    attr_map.set_key_str(&attr.name.local, VVal::new_str_mv(attr.value.to_string()));
            }

            let element = VVal::pair(VVal::new_sym(&name.local), attr_map);
            attach(element.clone());
            element
        }
        _ => {
            let unknown = VVal::pair(VVal::new_sym("unknown"), VVal::map1("_", VVal::vec()));
            attach(unknown.clone());
            unknown
        }
    };

    for child in handle.children.borrow().iter() {
        handle2vval_simplified(child, &current, level + 1);
    }
}
/// Parses the HTML document in `s` and returns it as nested `VVal` maps.
///
/// If `prune_childs` is `true`, the generic `%childs` list is left out of
/// every node. See `handle2vval` for the structure of the returned maps.
pub fn parse(s: &str, prune_childs: bool) -> VVal {
    let mut parser = parse_document(RcDom::default(), Default::default());
    parser.process(StrTendril::from(s));

    let document = parser.finish().document;
    handle2vval(&document, prune_childs)
}
/// Parses the HTML document in `s` into the simplified pair representation.
///
/// The result is a pair with the symbol `root` and a map whose `"_"` key
/// holds the top-level nodes of the document. See `handle2vval_simplified`
/// for the structure of the nested nodes.
pub fn parse_simplified(s: &str) -> VVal {
    let mut parser = parse_document(RcDom::default(), Default::default());
    parser.process(StrTendril::from(s));
    let document = parser.finish().document;

    let root = VVal::pair(VVal::new_sym("root"), VVal::map1("_", VVal::vec()));
    handle2vval_simplified(&document, &root, 0);
    root
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the map structure generated by `parse` directly from Rust.
    #[test]
    fn test_rcdom2vval() {
        let v = parse("<a hannes=\"wurstkopp\" test=3 test=4>mit wurst\n <i>in</i> kaka\n</a><a href=börg>penis</a>", false);
        assert_eq!(v.v_s_rawk("%rtext"), "mit wurst\n in kaka\npenis");

        let anchors = v.v_k("html").v_(0).v_k("body").v_(0).v_k("a");
        let first = anchors.v_(0);
        let second = anchors.v_(1);

        assert_eq!(first.v_s_rawk("%rtext"), "mit wurst\n in kaka\n");
        // `%text` only contains the direct text children "mit wurst\n " and
        // " kaka\n", so the spaces around the removed <i> end up adjacent.
        assert_eq!(first.v_s_rawk("%text"), "mit wurst\n  kaka\n");
        // The duplicate `test=4` attribute is dropped by the HTML parser.
        assert_eq!(first.v_s_rawk("%attrs"), "$[$p(:hannes,\"wurstkopp\"),$p(:test,\"3\")]");
        assert_eq!(first.v_s_rawk("@hannes"), "wurstkopp");
        assert_eq!(first.v_s_rawk("@test"), "3");

        assert_eq!(second.v_s_rawk("%text"), "penis");
        assert_eq!(second.v_s_rawk("%attrs"), "$[$p(:href,\"börg\")]");
    }

    /// Parses `html`, binds the result to the global `@@` and returns the
    /// serialized result of evaluating `wlcode`.
    fn run_wlambda_p(wlcode: &str, html: &str, prune_childs: bool) -> String {
        use crate::EvalContext;

        let v = parse(html, prune_childs);
        let mut ctx = EvalContext::new_default();
        ctx.set_global_var("@@", &v);
        ctx.eval(wlcode).unwrap().s()
    }

    /// Same as `run_wlambda_p`, but always keeps the `%childs` lists.
    fn run_wlambda(wlcode: &str, html: &str) -> String {
        run_wlambda_p(wlcode, html, false)
    }

    /// Checks that the generated structure can be queried with selectors.
    #[test]
    fn test_dom_wlambda_selector() {
        let html1 = "<a hannes=\"wurstkopp\" test=3 test=4>mit wurst\n <i @x=32>in</i> kaka\n</a><a href=börg>penis</a>";
        assert_eq!(
            run_wlambda("$S{html/0/body/*/%rtext} @@ | 0", html1),
            "\"mit wurst\\n in kaka\\npenis\""
        );
        assert_eq!(run_wlambda("$S{**/@test} @@ | 0", html1), "\"3\"");
        assert_eq!(run_wlambda("$S{**/@@x} @@ | 0", html1), "\"32\"");
        assert_eq!(
            run_wlambda("$S[ html/0/body/0/a/0/%attrs ] @@", html1),
            "$[$[$p(:hannes,\"wurstkopp\"),$p(:test,\"3\")]]"
        );

        let html2 = "<a hannes=\"wurstkopp\" test=3 test=4>mit wurst\n <i @x=32>in</i> kaka\n</a><a href=börg id=\"xxx\">penis</a>";
        assert_eq!(
            run_wlambda(
                "#std:displayln ~ std:ser:json @@;\n $S[ **!key=%childs/*:{@id=xxx}/%text ] @@ | 0",
                html2
            ),
            "\"penis\""
        );
    }
}

2
src/stdlib/imap.rs

@ -232,12 +232,10 @@ impl VValUserData for VImapSession {
Ok(dirs) => dirs,
Err(e) => return Ok(env.new_err(format!("$<IMAP>.list error: {}", e))),
};
println!("DIRS: {:?}", dirs);
let ret = VVal::vec();
for dir in dirs.iter() {
println!("DIRS: {:?}", dir);
ret.push(VVal::new_str(dir.name()));
}

4
src/stdlib/mod.rs

@ -12,6 +12,10 @@ use super::compiler::*;
mod helpers;
mod sqlite;
mod imap;
#[cfg(feature = "html")]
mod html2vval;
#[cfg(feature = "html")]
mod rcdom;
pub use helpers::PendingResult;
pub fn add_to_symtable(st: &mut SymbolTable) {

508
src/stdlib/rcdom.rs

@ -0,0 +1,508 @@
// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! A simple reference-counted DOM.
//!
//! This is sufficient as a static parse tree, but don't build a
//! web browser using it. :)
//!
//! A DOM is a [tree structure] with ordered children that can be represented in an XML-like
//! format. For example, the following graph
//!
//! ```text
//! div
//! +- "text node"
//! +- span
//! ```
//! in HTML would be serialized as
//!
//! ```html
//! <div>text node<span></span></div>
//! ```
//!
//! See the [document object model article on wikipedia][dom wiki] for more information.
//!
//! This implementation stores the information associated with each node once, and then hands out
//! refs to children. The nodes themselves are reference-counted to avoid copying - you can create
//! a new ref and then a node will outlive the document. Nodes own their children, but only have
//! weak references to their parents.
//!
//! [tree structure]: https://en.wikipedia.org/wiki/Tree_(data_structure)
//! [dom wiki]: https://en.wikipedia.org/wiki/Document_Object_Model
extern crate markup5ever;
extern crate tendril;
use std::borrow::Cow;
use std::cell::{Cell, RefCell};
use std::collections::HashSet;
use std::default::Default;
use std::fmt;
use std::io;
use std::mem;
use std::rc::{Rc, Weak};
use tendril::StrTendril;
use markup5ever::interface::tree_builder;
use markup5ever::interface::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink};
use markup5ever::serialize::TraversalScope;
use markup5ever::serialize::TraversalScope::{ChildrenOnly, IncludeNode};
use markup5ever::serialize::{Serialize, Serializer};
use markup5ever::Attribute;
use markup5ever::ExpandedName;
use markup5ever::QualName;
/// The different kinds of nodes in the DOM.
///
/// This is the payload of a [`Node`]; the tree structure itself (parent and
/// children) is stored in the `Node`.
#[derive(Debug)]
pub enum NodeData {
    /// The `Document` itself - the root node of a HTML document.
    ///
    /// Also used as the container node for the contents of a template
    /// element.
    Document,
    /// A `DOCTYPE` with name, public id, and system id. See
    /// [document type declaration on wikipedia][dtd wiki].
    ///
    /// [dtd wiki]: https://en.wikipedia.org/wiki/Document_type_declaration
    Doctype {
        name: StrTendril,
        public_id: StrTendril,
        system_id: StrTendril,
    },
    /// A text node.
    ///
    /// The contents are kept in a `RefCell` because adjacent text is merged
    /// into an already existing text node while the tree is being built.
    Text { contents: RefCell<StrTendril> },
    /// A comment.
    Comment { contents: StrTendril },
    /// An element with attributes.
    Element {
        /// Qualified name of the element.
        name: QualName,
        /// Attributes of the element. Mutable, because missing attributes can
        /// be added to an existing element while the tree is being built.
        attrs: RefCell<Vec<Attribute>>,
        /// For HTML \<template\> elements, the [template contents].
        ///
        /// [template contents]: https://html.spec.whatwg.org/multipage/#template-contents
        template_contents: Option<Handle>,
        /// Whether the node is a [HTML integration point].
        ///
        /// [HTML integration point]: https://html.spec.whatwg.org/multipage/#html-integration-point
        mathml_annotation_xml_integration_point: bool,
    },
    /// A Processing instruction.
    ProcessingInstruction {
        target: StrTendril,
        contents: StrTendril,
    },
}
/// A DOM node.
///
/// Nodes are handed out as reference counted [`Handle`]s. A node holds
/// strong references to its children and only a weak reference to its
/// parent.
pub struct Node {
    /// Parent node. `None` if the node is not attached to a parent.
    pub parent: Cell<Option<WeakHandle>>,
    /// Child nodes of this node.
    pub children: RefCell<Vec<Handle>>,
    /// Represents this node's data.
    pub data: NodeData,
}
impl Node {
    /// Wraps `data` into a fresh, reference counted node that has neither a
    /// parent nor any children.
    pub fn new(data: NodeData) -> Rc<Self> {
        let parent = Cell::new(None);
        let children = RefCell::new(Vec::new());

        Rc::new(Node { data, parent, children })
    }
}
impl Drop for Node {
    /// Detaches the whole subtree below this node iteratively.
    ///
    /// All descendants are moved into one flat work list and their own child
    /// lists are emptied before they are dropped, so dropping a descendant
    /// never has to descend any further (presumably to keep very deep trees
    /// from recursing deeply on drop).
    fn drop(&mut self) {
        let mut pending = mem::take(&mut *self.children.borrow_mut());

        while let Some(node) = pending.pop() {
            let grandchildren = mem::take(&mut *node.children.borrow_mut());
            pending.extend(grandchildren);
        }
    }
}
impl fmt::Debug for Node {
    /// Prints the node data and the children. The parent is left out.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut repr = f.debug_struct("Node");
        repr.field("data", &self.data);
        repr.field("children", &self.children);
        repr.finish()
    }
}
/// Reference to a DOM node.
///
/// Cloning a handle only creates another strong reference to the same node.
pub type Handle = Rc<Node>;
/// Weak reference to a DOM node, used for parent pointers.
///
/// Has to be upgraded to a [`Handle`] before the node can be accessed.
pub type WeakHandle = Weak<Node>;
/// Makes `child` the last child of `new_parent`.
///
/// # Panics
///
/// Panics if `child` is already attached to a parent.
fn append(new_parent: &Handle, child: Handle) {
    let back_ref = Rc::downgrade(new_parent);
    let previous_parent = child.parent.replace(Some(back_ref));
    // Invariant: only parentless nodes may be appended.
    assert!(previous_parent.is_none());

    let mut siblings = new_parent.children.borrow_mut();
    siblings.push(child);
}
/// Looks up the parent of `target` and the position of `target` in the
/// parent's child list. Returns `None` if `target` has no parent.
///
/// # Panics
///
/// Panics if the parent was already dropped or if the parent does not list
/// `target` as one of its children.
fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> {
    // The `Cell` can't be borrowed, so the weak reference is taken out for
    // the upgrade and put back right afterwards.
    let weak = target.parent.take()?;
    let parent = weak.upgrade().expect("dangling weak pointer");
    target.parent.set(Some(weak));

    let position = parent.children.borrow().iter().position(|child| Rc::ptr_eq(child, target));

    match position {
        Some(i) => Some((parent, i)),
        None => panic!("have parent but couldn't find in parent's children!"),
    }
}
/// Appends `text` to `prev` if `prev` is a text node.
///
/// Returns `true` if the text was appended and `false` if `prev` is any
/// other kind of node (in which case nothing is changed).
fn append_to_existing_text(prev: &Handle, text: &str) -> bool {
    if let NodeData::Text { ref contents } = prev.data {
        contents.borrow_mut().push_slice(text);
        return true;
    }

    false
}
/// Detaches `target` from its parent. Does nothing if it has no parent.
fn remove_from_parent(target: &Handle) {
    let (parent, index) = match get_parent_and_index(target) {
        Some(found) => found,
        None => return,
    };

    parent.children.borrow_mut().remove(index);
    target.parent.set(None);
}
/// The DOM itself; the result of parsing.
///
/// Serves as the tree sink that is filled by the parser and is also what the
/// parser returns once it is finished.
#[derive(Debug)]
pub struct RcDom {
    /// The `Document` itself.
    pub document: Handle,
    /// Errors that occurred during parsing.
    pub errors: Vec<Cow<'static, str>>,
    /// The document's quirks mode.
    pub quirks_mode: QuirksMode,
}
/// Tree builder callbacks. The parser drives these methods to construct the
/// reference counted node tree below `self.document`.
impl TreeSink for RcDom {
    type Output = Self;
    /// The finished DOM is the sink itself.
    fn finish(self) -> Self {
        self
    }
    type Handle = Handle;
    /// Records a parse error; parsing is not interrupted by this.
    fn parse_error(&mut self, msg: Cow<'static, str>) {
        self.errors.push(msg);
    }
    /// Returns a new handle to the document root node.
    fn get_document(&mut self) -> Handle {
        self.document.clone()
    }
    /// Returns the contents node of a template element.
    ///
    /// Panics if `target` is not an element with template contents.
    fn get_template_contents(&mut self, target: &Handle) -> Handle {
        if let NodeData::Element {
            template_contents: Some(ref contents),
            ..
        } = target.data
        {
            contents.clone()
        } else {
            panic!("not a template element!")
        }
    }
    /// Stores the quirks mode of the document.
    fn set_quirks_mode(&mut self, mode: QuirksMode) {
        self.quirks_mode = mode;
    }
    /// Two handles are the same node if they point to the same allocation.
    fn same_node(&self, x: &Handle, y: &Handle) -> bool {
        Rc::ptr_eq(x, y)
    }
    /// Returns the expanded name of an element.
    ///
    /// Panics if `target` is not an element.
    fn elem_name<'a>(&self, target: &'a Handle) -> ExpandedName<'a> {
        return match target.data {
            NodeData::Element { ref name, .. } => name.expanded(),
            _ => panic!("not an element!"),
        };
    }
    /// Creates a new, unattached element node.
    ///
    /// If the template flag is set, the element gets an empty `Document`
    /// node that holds the template contents.
    fn create_element(
        &mut self,
        name: QualName,
        attrs: Vec<Attribute>,
        flags: ElementFlags,
    ) -> Handle {
        Node::new(NodeData::Element {
            name: name,
            attrs: RefCell::new(attrs),
            template_contents: if flags.template {
                Some(Node::new(NodeData::Document))
            } else {
                None
            },
            mathml_annotation_xml_integration_point: flags.mathml_annotation_xml_integration_point,
        })
    }
    /// Creates a new, unattached comment node.
    fn create_comment(&mut self, text: StrTendril) -> Handle {
        Node::new(NodeData::Comment { contents: text })
    }
    /// Creates a new, unattached processing instruction node.
    fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Handle {
        Node::new(NodeData::ProcessingInstruction {
            target: target,
            contents: data,
        })
    }
    /// Appends a node or text as last child of `parent`.
    ///
    /// Text is merged into the last child if that is a text node, otherwise
    /// a new text node is created for it.
    fn append(&mut self, parent: &Handle, child: NodeOrText<Handle>) {
        // Append to an existing Text node if we have one.
        match child {
            NodeOrText::AppendText(ref text) => match parent.children.borrow().last() {
                Some(h) => {
                    if append_to_existing_text(h, &text) {
                        return;
                    }
                },
                _ => (),
            },
            _ => (),
        }
        // Nothing to merge with: attach the node (or a new text node).
        append(
            &parent,
            match child {
                NodeOrText::AppendText(text) => Node::new(NodeData::Text {
                    contents: RefCell::new(text),
                }),
                NodeOrText::AppendNode(node) => node,
            },
        );
    }
    /// Inserts a node or text directly before `sibling`.
    ///
    /// Text is merged into the node before the insertion point if that is a
    /// text node. Panics if `sibling` has no parent.
    fn append_before_sibling(&mut self, sibling: &Handle, child: NodeOrText<Handle>) {
        let (parent, i) = get_parent_and_index(&sibling)
            .expect("append_before_sibling called on node without parent");
        let child = match (child, i) {
            // No previous node.
            (NodeOrText::AppendText(text), 0) => Node::new(NodeData::Text {
                contents: RefCell::new(text),
            }),
            // Look for a text node before the insertion point.
            (NodeOrText::AppendText(text), i) => {
                let children = parent.children.borrow();
                let prev = &children[i - 1];
                if append_to_existing_text(prev, &text) {
                    return;
                }
                Node::new(NodeData::Text {
                    contents: RefCell::new(text),
                })
            },
            // The tree builder promises we won't have a text node after
            // the insertion point.
            // Any other kind of node.
            (NodeOrText::AppendNode(node), _) => node,
        };
        // An existing node might still be attached somewhere else, detach it
        // before it is linked to the new parent.
        remove_from_parent(&child);
        child.parent.set(Some(Rc::downgrade(&parent)));
        parent.children.borrow_mut().insert(i, child);
    }
    /// Inserts `child` before `element` if `element` has a parent, otherwise
    /// appends it to `prev_element`.
    fn append_based_on_parent_node(
        &mut self,
        element: &Self::Handle,
        prev_element: &Self::Handle,
        child: NodeOrText<Self::Handle>,
    ) {
        // Peek into the `Cell`: take the parent out, check it, put it back.
        let parent = element.parent.take();
        let has_parent = parent.is_some();
        element.parent.set(parent);
        if has_parent {
            self.append_before_sibling(element, child);
        } else {
            self.append(prev_element, child);
        }
    }
    /// Appends a doctype node as child of the document node.
    fn append_doctype_to_document(
        &mut self,
        name: StrTendril,
        public_id: StrTendril,
        system_id: StrTendril,
    ) {
        append(
            &self.document,
            Node::new(NodeData::Doctype {
                name: name,
                public_id: public_id,
                system_id: system_id,
            }),
        );
    }
    /// Adds those of the given attributes to `target` whose name is not
    /// already present on it. Existing attributes are left untouched.
    ///
    /// Panics if `target` is not an element.
    fn add_attrs_if_missing(&mut self, target: &Handle, attrs: Vec<Attribute>) {
        let mut existing = if let NodeData::Element { ref attrs, .. } = target.data {
            attrs.borrow_mut()
        } else {
            panic!("not an element")
        };
        // Snapshot of the names that are present before anything is added.
        let existing_names = existing
            .iter()
            .map(|e| e.name.clone())
            .collect::<HashSet<_>>();
        existing.extend(
            attrs
                .into_iter()
                .filter(|attr| !existing_names.contains(&attr.name)),
        );
    }
    /// Detaches `target` from its parent, if it has one.
    fn remove_from_parent(&mut self, target: &Handle) {
        remove_from_parent(&target);
    }
    /// Moves all children of `node` to the end of `new_parent`'s children.
    ///
    /// Panics if a child does not have `node` as its parent. Both child
    /// lists are borrowed mutably at the same time, so `node` and
    /// `new_parent` must be different nodes.
    fn reparent_children(&mut self, node: &Handle, new_parent: &Handle) {
        let mut children = node.children.borrow_mut();
        let mut new_children = new_parent.children.borrow_mut();
        for child in children.iter() {
            let previous_parent = child.parent.replace(Some(Rc::downgrade(&new_parent)));
            assert!(Rc::ptr_eq(
                &node,
                &previous_parent.unwrap().upgrade().expect("dangling weak")
            ))
        }
        // Leaves `node` with an empty child list.
        new_children.extend(mem::replace(&mut *children, Vec::new()));
    }
    /// Returns the integration point flag the element was created with.
    ///
    /// Panics if `target` is not an element.
    fn is_mathml_annotation_xml_integration_point(&self, target: &Handle) -> bool {
        if let NodeData::Element {
            mathml_annotation_xml_integration_point,
            ..
        } = target.data
        {
            mathml_annotation_xml_integration_point
        } else {
            panic!("not an element!")
        }
    }
}
impl Default for RcDom {
    /// Creates an empty DOM: just the document node, no recorded errors and
    /// quirks mode switched off.
    fn default() -> Self {
        let document = Node::new(NodeData::Document);

        Self {
            document,
            errors: Vec::new(),
            quirks_mode: tree_builder::NoQuirks,
        }
    }
}
/// A pending step of the iterative serialization of a node tree.
enum SerializeOp {
    /// Serialize this node; for elements this emits the start tag and
    /// schedules the children and the matching `Close`.
    Open(Handle),
    /// Emit the end tag of the element with this name.
    Close(QualName),
}
/// Wrapper around a [`Handle`] that implements `Serialize`.
pub struct SerializableHandle(Handle);
impl From<Handle> for SerializableHandle {
fn from(h: Handle) -> SerializableHandle {
SerializableHandle(h)
}
}
impl Serialize for SerializableHandle {
fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
where
S: Serializer,
{
let mut ops = match traversal_scope {
IncludeNode => vec![SerializeOp::Open(self.0.clone())],
ChildrenOnly(_) => self
.0
.children
.borrow()
.iter()
.map(|h| SerializeOp::Open(h.clone()))
.collect(),
};
while !ops.is_empty() {
match ops.remove(0) {
SerializeOp::Open(handle) => match &handle.data {
&NodeData::Element {
ref name,
ref attrs,
..
} => {
serializer.start_elem(
name.clone(),
attrs.borrow().iter().map(|at| (&at.name, &at.value[..])),
)?;
ops.insert(0, SerializeOp::Close(name.clone()));
for child in handle.children.borrow().iter().rev() {
ops.insert(0, SerializeOp::Open(child.clone()));
}
},
&NodeData::Doctype { ref name, .. } => serializer.write_doctype(&name)?,
&NodeData::Text { ref contents } => {
serializer.write_text(&contents.borrow())?
},
&NodeData::Comment { ref contents } => serializer.write_comment(&contents)?,
&NodeData::ProcessingInstruction {
ref target,
ref contents,
} => serializer.write_processing_instruction(target, contents)?,
&NodeData::Document => panic!("Can't serialize Document node itself"),
},
SerializeOp::Close(name) => {
serializer.end_elem(name)?;
},
}
}
Ok(())
}
}

26
src/stdlib/util.rs

@ -57,4 +57,30 @@ pub fn add_to_symtable(st: &mut SymbolTable) {
Some(1),
false,
);
#[cfg(feature = "html")]
st.fun(
"html:parse_simple",
|env: &mut Env, _argc: usize| {
env.arg(0).with_s_ref(|s| {
Ok(super::html2vval::parse_simplified(s))
})
},
Some(1),
Some(1),
false,
);
#[cfg(feature = "html")]
st.fun(
"html:parse_complex",
|env: &mut Env, _argc: usize| {
env.arg(0).with_s_ref(|s| {
Ok(super::html2vval::parse(s, true))
})
},
Some(1),
Some(1),
false,
);
}

Loading…
Cancel
Save