index 0000000..dfc7f75
--- /dev/null
+++ b/src/parse.rs
@@ -0,0 +1,497 @@
+use std::collections::HashMap;
+use crate::directives::expand_directive;
/// Attribute names mapped to their values; an attribute written without a
/// value is stored with an empty string.
pub type Attributes = HashMap<String, String>;
/// How far a parser advanced through its input.
pub type Offset = usize;
/// A successful parse: the parsed value, the unconsumed remainder of the
/// input, and the distance consumed.
pub type Parse<'a, T> = (T, &'a str, Offset);
/// A parse that may fail; `None` means the input did not match.
pub type MaybeParse<'a, T> = Option<Parse<'a, T>>;
+fn parse_until(i: &str, condition: impl Fn(char) -> bool) -> Parse<&str> {
+ match i.chars().position(condition) {
+ Some(pos) => (&i[..pos], &i[pos..], pos),
+ None => (&i, "", i.len()),
+ }
+fn parse_until_str<'a>(
+ tail: &'a str,
+ to_match: &'static str,
+) -> MaybeParse<'a, &'a str> {
+ for i in 0..tail.len() {
+ let substr = &tail[0..i];
+ if substr.ends_with(to_match) {
+ let end = i - to_match.len();
+ return Some((&tail[0..end], &tail[end..], end));
+ }
+ }
+ None
+fn parse_str<'a>(i: &'a str, matches: &str) -> MaybeParse<'a, &'a str> {
+ let length = matches.len();
+ match i.get(0..matches.len()) {
+ Some(s) => {
+ if s.eq_ignore_ascii_case(matches) {
+ Some((s, &i[length..], length))
+ } else {
+ None
+ }
+ },
+ None => None,
+ }
+fn parse_char(i: &str, matches: char) -> MaybeParse {
+ if let Some(c) = i.chars().next() {
+ if c == matches {
+ return Some((c, &i[1..], 1));
+ }
+ }
+ None
+fn parse_delimited(i: &str, delim: char) -> MaybeParse<&str> {
+ let (_start, i, o1) = parse_char(i, delim)?;
+ let (contents, i, o2) = parse_until(i, |c| c == delim);
+ let (_end, i, o3) = parse_char(i, delim)?;
+ Some((contents, i, o1 + o2 + o3))
/// Converts a byte index into `input` to a 1-based `(row, column)` pair.
///
/// Every `'\n'` before `index` starts a new row and resets the column to 1;
/// every other character advances the column by one. An `index` past the end
/// of `input` is treated as the end of input, and an `index` that falls
/// inside a multibyte character counts that character as already passed, so
/// the function never panics.
fn index_to_rc(input: &str, index: usize) -> (usize, usize) {
    input
        .char_indices()
        // Count only the characters that start before `index`.
        .take_while(|&(pos, _)| pos < index)
        .fold((1, 1), |(row, col), (_, c)| {
            if c == '\n' {
                (row + 1, 1)
            } else {
                (row, col + 1)
            }
        })
}
/// Element-count ceiling for parsing; `ErrorKind::MemoryLimit` documents the
/// same 65535 bound.
pub const LEXEME_MEMORY_LIMIT: usize = 65535;
/// Tags that are implicitly self-closing; ending them in `/>` is optional.
const VOID_ELEMENTS: [&str; 16] = [
    "area", "base", "br", "col", "command", "embed", "hr", "img", "input",
    "keygen", "link", "meta", "param", "source", "track", "wbr",
];
/// The categories of failure the parser reports.
#[derive(Clone, Debug)]
pub enum ErrorKind {
    /// Encountered illegal sequence
    Illegal,
    /// Closing tag does not match previous opening tag
    UnbalancedTags,
    /// Tried to parse `N > 65535` elements
    MemoryLimit,
}
+impl From for crate::trace::Error {
+ fn from(value: Error) -> Self {
+ Self {
+ kind: crate::trace::ErrorKind::Parsing,
+ reason: match value.kind {
+ ErrorKind::Illegal => "Illegal character encountered",
+ ErrorKind::UnbalancedTags => "Unbalanced open and close tags",
+ ErrorKind::MemoryLimit => "Ran out of memory",
+ }
+ .to_string(),
+ backtrace: vec![format!("at line {} column {}", value.row, value.column)],
+ }
+ }
+#[derive(Clone, Debug)]
+pub struct Error {
+ pub kind: ErrorKind,
+ pub char_index: usize,
+ pub row: usize,
+ pub column: usize,
+#[derive(Clone, Debug)]
+pub enum HtmlElement {
+ /// The required `<!DOCTYPE html>` preamble
+ DocType,
+ /// Text inside of a `<!-- -->` comment
+ Comment(String),
+ /// Any opening tag, including self-closing `<tag />` tags
+ OpenTag {
+ /// The name of the tag
+ name: String,
+ /// Attribute names and their values if present
+ attributes: Attributes,
+ /// Whether the tag closes itself. Tags that are
+ /// implicitly empty are:
+ /// `area, base, br, col, command, embed, hr, img, input
+ /// keygen, link, meta, param, source, track, wbr`
+ is_empty: bool,
+ },
+ /// Any closing tag that is not empty. Closing implicitly
+ /// empty tags is an error
+ CloseTag { name: String },
+ /// Any inner text that is not entirely whitespace
+ Text(String),
+ /// A `",
+ serialize_attributes(attributes),
+ contents
+ )
+ },
+ Self::Text(t) => t.clone(),
+ Self::Directive {
+ name,
+ attributes,
+ contents,
+ } => expand_directive(&name, &attributes, &contents),
+ }
+ }
/// Used as the `condition` argument for `parse_until` to parse names of
/// things: returns `true` for the first character that can NOT be part of a
/// name (anything other than ASCII alphanumerics, `:`, `_`, `-` and `@`).
const NAME_REGEX: fn(char) -> bool =
    |c| !(c.is_ascii_alphanumeric() || matches!(c, ':' | '_' | '-' | '@'));
/// Used as the `condition` argument for `parse_until` to skip arbitrary
/// whitespace: returns `true` for the first non-whitespace character.
const WS_REGEX: fn(char) -> bool = |c| !c.is_whitespace();
+fn parse_doctype(i: &str) -> MaybeParse {
+ let (_, i, o1) = parse_str(i, "")?;
+ Some((HtmlElement::DocType, i, o1 + o2 + o3 + o4 + o5))
+fn parse_comment<'a>(tail: &'a str) -> MaybeParse {
+ let (_, tail, o1) = parse_str(tail, "")?;
+ let (_, tail, o3) = parse_str(tail, "-->")?;
+ Some((HtmlElement::Comment(comment.into()), tail, o1 + o2 + o3))
+fn parse_raw_text(i: &str) -> MaybeParse<(String, Attributes, String)> {
+ let (open, mut i, o1) = parse_open_tag(i)?;
+ let (open_name, attributes, is_empty) = match open {
+ HtmlElement::OpenTag {
+ name,
+ attributes,
+ is_empty,
+ } => (name, attributes, is_empty),
+ _ => unreachable!(),
+ };
+ let (close_name, contents, i, o2) = if is_empty {
+ (open_name.clone(), "".to_string(), i, 0)
+ } else {
+ let mut contents = String::new();
+ let mut o2 = 0;
+ while !i.is_empty() {
+ let (text, new_i, new_off) = parse_text(i);
+ contents.push_str(&text);
+ o2 += new_off;
+ i = new_i;
+ if i.starts_with("") {
+ if let Some((HtmlElement::CloseTag { name }, _, _)) = parse_close_tag(i)
+ {
+ if name == open_name {
+ break;
+ }
+ }
+ }
+ if text.is_empty() {
+ i = &i[1..];
+ o2 += 1;
+ contents.push('<');
+ }
+ }
+ let (close, i, o3) = parse_close_tag(i)?;
+ let close_name = match close {
+ HtmlElement::CloseTag { name } => name,
+ _ => unreachable!(),
+ };
+ (close_name, contents, i, o2 + o3)
+ };
+ if open_name != close_name {
+ return None;
+ }
+ Some(((open_name, attributes, contents), i, o1 + o2))
+fn parse_style(i: &str) -> MaybeParse {
+ let ((name, attributes, contents), i, o) = parse_raw_text(i)?;
+ if name != "style" {
+ None
+ } else {
+ Some((
+ HtmlElement::Style {
+ attributes,
+ contents,
+ },
+ i,
+ o,
+ ))
+ }
+fn parse_script(i: &str) -> MaybeParse {
+ let ((name, attributes, contents), i, o) = parse_raw_text(i)?;
+ if name != "script" {
+ None
+ } else {
+ Some((
+ HtmlElement::Script {
+ attributes,
+ contents,
+ },
+ i,
+ o,
+ ))
+ }
+fn parse_directive(i: &str) -> MaybeParse {
+ let ((name, attributes, contents), i, o) = parse_raw_text(i)?;
+ if let Some(name) = name.strip_prefix('@') {
+ Some((
+ HtmlElement::Directive {
+ name: name.to_string(),
+ attributes,
+ contents,
+ },
+ i,
+ o,
+ ))
+ } else {
+ None
+ }
+fn parse_open_tag(i: &str) -> MaybeParse {
+ let (_, i, o1) = parse_str(i, "<")?;
+ let (name, i, o2) = parse_until(i, NAME_REGEX);
+ let mut attributes = HashMap::new();
+ let mut i = i;
+ let mut o3 = 0;
+ while let Some(((key, value), new_i, new_o)) = parse_attribute(i) {
+ attributes.insert(key, value);
+ i = new_i;
+ o3 += new_o;
+ }
+ let (_, i, o4) = parse_until(i, WS_REGEX);
+ // Find all attributes
+ let (is_empty, i, o5) = match parse_str(i, "/") {
+ Some((_, i, o5)) => (true, i, o5),
+ None => (false, i, 0),
+ };
+ let is_empty = is_empty || VOID_ELEMENTS.contains(&name);
+ let (_, i, o6) = parse_char(i, '>')?;
+ Some((
+ HtmlElement::OpenTag {
+ name: name.to_string(),
+ attributes,
+ is_empty,
+ },
+ i,
+ o1 + o2 + o3 + o4 + o5 + o6,
+ ))
+fn parse_attribute(i: &str) -> MaybeParse<(String, String)> {
+ let (_, i, o1) = parse_until(i, WS_REGEX);
+ if o1 == 0 {
+ return None;
+ }
+ let (key, i, o2) = parse_until(i, NAME_REGEX);
+ if o2 == 0 {
+ return None;
+ }
+ let get_value = || -> Option<(&str, &str, Offset)> {
+ let (_, i, o1) = parse_until(i, WS_REGEX);
+ let (_, i, o2) = parse_str(i, "=")?;
+ let (_, i, o3) = parse_until(i, WS_REGEX);
+ let (value, i, o4) = parse_delimited(i, '\"')
+ .or_else(|| parse_delimited(i, '\''))
+ .unwrap_or_else(|| parse_until(i, NAME_REGEX));
+ Some((value, i, o1 + o2 + o3 + o4))
+ };
+ let (value, i, o3) = get_value().unwrap_or(("", i, 0));
+ Some(((key.to_string(), value.to_string()), i, o1 + o2 + o3))
+fn parse_close_tag(i: &str) -> MaybeParse {
+ let (_, i, o1) = parse_str(i, "")?;
+ let (name, i, o2) = parse_until(i, NAME_REGEX);
+ if o2 == 0 {
+ return None;
+ }
+ let (_, i, o3) = parse_until(i, WS_REGEX);
+ let (_, i, o4) = parse_str(i, ">")?;
+ Some((
+ HtmlElement::CloseTag {
+ name: name.to_string(),
+ },
+ i,
+ o1 + o2 + o3 + o4,
+ ))
+fn parse_text(i: &str) -> Parse {
+ let (text, i, o1) = parse_until(i, |c| c == '<');
+ (text.to_string(), i, o1)
+pub fn parse_html(
+ input: &str,
+) -> Result, crate::trace::Error> {
+ let mut output = vec![];
+ let mut validation_stack = vec![];
+ let mut i = input;
+ let mut offset = 0;
+ let throw_err = |kind, offset| {
+ let (row, column) = index_to_rc(input, offset);
+ Error {
+ kind,
+ char_index: offset,
+ row,
+ column,
+ }
+ .into()
+ };
+ while i.len() > 0 {
+ let (lm, new_i, new_off) = if i.starts_with("", &mut new_index).ok_or(ParseError::Unknown)?;
- *index += new_index;
- Ok((Lexeme::Comment, tail))
-fn parse_text<'a>(
- tail: &'a str,
- index: &mut usize,
-) -> Result<(Lexeme<'a>, &'a str), ParseError> {
- let mut new_index = 0;
- let (txt, tail) = parse_while(tail, |c| c != '<', &mut new_index);
- if txt.is_empty() {
- Err(ParseError::Unknown)
- } else {
- *index += new_index;
- Ok((Lexeme::Text(txt), tail))
- }
-fn or_keep_error<'a>(
- r: Result<(Lexeme<'a>, &'a str), ParseError>,
- op: impl FnOnce() -> Result<(Lexeme<'a>, &'a str), ParseError>,
-) -> Result<(Lexeme<'a>, &'a str), ParseError> {
- match r {
- Ok(val) => Ok(val),
- Err(e) => op().map_err(|new_e| match e {
- ParseError::Unknown => new_e,
- _ => e,
- }),
- }
-fn index_to_rc(input: &str, index: usize) -> (usize, usize) {
- let (mut row, mut col) = (1, 1);
- for c in input[0..index].chars() {
- if c == '\n' {
- row += 1;
- col = 1;
- } else {
- col += 1;
- }
- }
- (row, col)
-pub fn parse_html(input: &str) -> trace::Result> {
- let mut tail = input;
- let mut lexeme_stack = vec![];
- let mut validation_stack = vec![];
- let mut index = 0;
- let err = |error: ParseError, index: usize| -> trace::Result> {
- let (row, col) = index_to_rc(input, index);
- let e: trace::Error = error.into();
- Err(e).ctx(format!("Starting at line {} character {}", row, col))
- };
- while !tail.is_empty() {
- let (_, new_tail) = parse_whitespace(tail, &mut index);
- if new_tail.is_empty() {
- break;
- }
- let result = or_keep_error(parse_open_tag(new_tail, &mut index), || {
- parse_close_tag(new_tail, &mut index)
- });
- let result = or_keep_error(result, || parse_text(new_tail, &mut index));
- let result = or_keep_error(result, || parse_comment(new_tail, &mut index));
- let (lm, new_tail) =
- match or_keep_error(result, || parse_doctype(new_tail, &mut index)) {
- Ok(v) => v,
- Err(e) => {
- return err(e, index);
- },
- };
- // Validate that open and close tags match
- match lm {
- Lexeme::OpenTag { name, is_void, .. } => {
- if !is_void {
- validation_stack.push(name);
- }
- },
- Lexeme::CloseTag { name } => {
- if VOID_ELEMENTS.contains(&name) {
- return err(ParseError::VoidClosingTag(name.into()).into(), index);
- }
- if let Some(top) = validation_stack.pop() {
- if name != top {
- return err(
- ParseError::MismatchedClosing {
- expected: top.into(),
- found: name.into(),
- },
- index,
- );
- }
- } else {
- return err(ParseError::UnmatchedClose(name.into()), index);
- }
- },
- Lexeme::Comment => {
- tail = new_tail;
- continue;
- },
- _ => {},
- };
- lexeme_stack.push(lm);
- tail = new_tail;
- }
- if let Some(top) = validation_stack.pop() {
- let e: trace::Error = ParseError::UnmatchedOpen(top.into()).into();
- return Err(e).ctx("At end of file");
- }
- Ok(lexeme_stack)
diff --git a/src/trace.rs b/src/trace.rs
- 0
- Remember Clear
- history