Skip to content

Commit

Permalink
Migrate to kuchiki
Browse files Browse the repository at this point in the history
  • Loading branch information
Mark-Simulacrum committed Jun 6, 2020
1 parent 4e76142 commit 6fee946
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 252 deletions.
100 changes: 0 additions & 100 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ badge = { path = "src/web/badge" }
failure = "0.1.3"
comrak = { version = "0.3", default-features = false }
toml = "0.5"
html5ever = "0.22"
kuchiki = "0.8"
schemamama = "0.3"
schemamama_postgres = "0.2"
rusoto_s3 = "0.40"
Expand Down Expand Up @@ -77,7 +77,6 @@ features = ["with-chrono", "with-serde_json"]

[dev-dependencies]
once_cell = "1.2.0"
kuchiki = "0.8"
criterion = "0.3"
rand = "0.7.3"

Expand Down
95 changes: 27 additions & 68 deletions src/utils/html.rs
Original file line number Diff line number Diff line change
@@ -1,78 +1,38 @@
use crate::error::Result;
use failure::err_msg;

use html5ever::driver::{parse_document, ParseOpts};
use html5ever::rcdom::{Handle, NodeData, RcDom};
use html5ever::serialize::{serialize, SerializeOpts};
use html5ever::tendril::TendrilSink;
use kuchiki::traits::TendrilSink;
use kuchiki::NodeRef;

/// Extracts the contents of the `<head>` and `<body>` tags from an HTML document, as well as the
/// classes on the `<body>` tag, if any.
///
/// Returns `(head, body, class)`: the serialized contents of `<head>`, the serialized contents of
/// `<body>`, and the value of the `class` attribute on `<body>` (an empty string when the
/// attribute is absent).
///
/// # Errors
///
/// Returns an error when no `<head>` or no `<body>` element can be located in the parsed document.
pub fn extract_head_and_body(html: &str) -> Result<(String, String, String)> {
// NOTE(review): this view is a rendered diff without +/- markers. The statements up to the first
// `Ok(...)` appear to be the html5ever-based body removed by this commit -- confirm.
// Parse the input into an html5ever `RcDom`.
let parser = parse_document(RcDom::default(), ParseOpts::default());
let dom = parser.one(html);

// Locate the two elements (`?` propagates the lookup error), then read the body's class.
let (head, body) = extract_from_rcdom(&dom)?;
let class = extract_class(&body);

Ok((stringify(head), stringify(body), class))
// NOTE(review): from here on is presumably the kuchiki-based replacement body -- confirm.
// Parse the input into a kuchiki DOM.
let dom = kuchiki::parse_html().one(html);

// Look up each element with a CSS selector; a failed lookup is turned into a descriptive error.
let head = dom
.select_first("head")
.map_err(|_| err_msg("couldn't find <head> tag in rustdoc output"))?;
let body = dom
.select_first("body")
.map_err(|_| err_msg("couldn't find <body> tag in rustdoc output"))?;

// Copy the `class` attribute out of the borrowed attribute map; a missing attribute yields "".
let class = body
.attributes
.borrow()
.get("class")
.map(|v| v.to_owned())
.unwrap_or_default();

// `serialize` (defined in this file) writes out each element's children.
Ok((serialize(head.as_node()), serialize(body.as_node()), class))
}

// NOTE(review): in this rendered diff the function below appears to be removed by the commit
// (html5ever-based implementation) -- confirm against the commit.
/// Searches the DOM for the `<head>` and `<body>` elements and returns their handles as
/// `(head, body)`.
///
/// # Errors
///
/// Returns an error if more than one `<head>` or more than one `<body>` element is encountered,
/// or if either element is not found at all.
fn extract_from_rcdom(dom: &RcDom) -> Result<(Handle, Handle)> {
// Explicit worklist used as a stack (`Vec::pop`), seeded with the document root.
let mut worklist = vec![dom.document.clone()];
let (mut head, mut body) = (None, None);

while let Some(handle) = worklist.pop() {
// Only element nodes carry a tag name; other node kinds are skipped but still descended into.
if let NodeData::Element { ref name, .. } = handle.data {
match name.local.as_ref() {
"head" => {
if head.is_some() {
return Err(err_msg("duplicate <head> tag"));
} else {
head = Some(handle.clone());
}
}

"body" => {
if body.is_some() {
return Err(err_msg("duplicate <body> tag"));
} else {
body = Some(handle.clone());
}
}

_ => {} // do nothing
}
}

// Children are queued unconditionally, so the search also continues inside a matched
// `<head>`/`<body>`; a nested duplicate therefore triggers the errors above.
worklist.extend(handle.children.borrow().iter().cloned());
}

// The whole tree has been visited; a tag that is still `None` was never seen.
let head = head.ok_or_else(|| err_msg("couldn't find <head> tag in rustdoc output"))?;
let body = body.ok_or_else(|| err_msg("couldn't find <body> tag in rustdoc output"))?;
Ok((head, body))
}

// NOTE(review): in this rendered diff the function below appears to be removed by the commit
// (html5ever-based implementation) -- confirm against the commit.
/// Serializes `node` to an HTML string using html5ever's default `SerializeOpts`.
///
/// # Panics
///
/// Panics if serialization into the in-memory buffer fails or if the produced bytes are not
/// valid UTF-8.
fn stringify(node: Handle) -> String {
// Serialize into an in-memory byte buffer first; html5ever writes bytes, not a `String`.
let mut vec = Vec::new();
serialize(&mut vec, &node, SerializeOpts::default()).expect("serializing into buffer failed");

String::from_utf8(vec).expect("html5ever returned non-utf8 data")
}

// NOTE(review): this span interleaves two functions from a rendered diff without +/- markers:
// `extract_class` (apparently removed) followed by `serialize` (apparently added). The closing
// braces at the end are shared by both in the diff view -- confirm against the commit.
/// Returns the value of the `class` attribute of `node`, or an empty string when `node` is not an
/// element or has no `class` attribute.
fn extract_class(node: &Handle) -> String {
match node.data {
NodeData::Element { ref attrs, .. } => {
let attrs = attrs.borrow();

// Find the attribute whose local name is "class"; default to "" when it is absent.
attrs
.iter()
.find(|a| &a.name.local == "class")
.map_or(String::new(), |a| a.value.to_string())
}

// Non-element nodes have no attributes.
_ => String::new(),
/// Serializes every child of `v` into a single HTML string; `v` itself is not passed to
/// `serialize`, only its children are.
///
/// # Panics
///
/// Panics if serializing a child fails or if the produced bytes are not valid UTF-8.
fn serialize(v: &NodeRef) -> String {
// All children are appended to the same in-memory byte buffer.
let mut contents = Vec::new();
for child in v.children() {
child
.serialize(&mut contents)
.expect("serialization failed");
}
String::from_utf8(contents).expect("non utf-8 html")
}

#[cfg(test)]
Expand All @@ -82,8 +42,7 @@ mod test {
let (head, body, class) = super::extract_head_and_body(
r#"<head><meta name="generator" content="rustdoc"></head><body class="rustdoc struct"><p>hello</p>"#
).unwrap();

assert_eq!(head, r#"<meta name="generator" content="rustdoc">"#);
assert_eq!(head, r#"<meta content="rustdoc" name="generator">"#);
assert_eq!(body, "<p>hello</p>");
assert_eq!(class, "rustdoc struct");
}
Expand Down
2 changes: 1 addition & 1 deletion src/web/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,7 @@ mod test {
test::*,
web::{handlebars_engine, match_version},
};
use html5ever::tendril::TendrilSink;
use kuchiki::traits::TendrilSink;
use serde_json::json;

fn release(version: &str, db: &TestDatabase) -> i32 {
Expand Down
4 changes: 1 addition & 3 deletions src/web/rustdoc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,7 @@ mod test {
use super::*;
use crate::test::*;
use chrono::Utc;
use kuchiki::traits::TendrilSink;
use reqwest::StatusCode;
use serde_json::json;
use std::{collections::BTreeMap, iter::FromIterator};
Expand All @@ -575,7 +576,6 @@ mod test {
path: &str,
web: &TestFrontend,
) -> Result<Option<String>, failure::Error> {
use html5ever::tendril::TendrilSink;
assert_success(path, web)?;
let data = web.get(path).send()?.text()?;
println!("{}", data);
Expand Down Expand Up @@ -872,7 +872,6 @@ mod test {
#[test]
fn yanked_release_shows_warning_in_nav() {
fn has_yanked_warning(path: &str, web: &TestFrontend) -> Result<bool, failure::Error> {
use html5ever::tendril::TendrilSink;
assert_success(path, web)?;
let data = web.get(path).send()?.text()?;
Ok(kuchiki::parse_html()
Expand Down Expand Up @@ -1083,7 +1082,6 @@ mod test {
path: &str,
web: &TestFrontend,
) -> Result<Vec<(String, String)>, failure::Error> {
use html5ever::tendril::TendrilSink;
assert_success(path, web)?;
let data = web.get(path).send()?.text()?;
let dom = kuchiki::parse_html().one(data);
Expand Down
Loading

0 comments on commit 6fee946

Please sign in to comment.