spring break work

This commit is contained in:
voidNUL 2024-03-16 18:27:14 -05:00
parent 5cef0b4563
commit dd7995ac31
7 changed files with 385 additions and 1165 deletions

1073
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -7,7 +7,5 @@ edition = "2021"
[dependencies]
html_parser = "0.7"
minify-html = "0.15"
grass = "0.13"
toml = "0.8"
walkdir = "2.5"

View file

@ -1,12 +1,13 @@
<!doctype html>
<html lang="en">
<head>
<link>
<link "asdf">
<lg:include rel='css' href="./style.css" />
</head>
<body>
<A href="fdfs"> asdf </A>
</body>
</html>

View file

@ -67,7 +67,7 @@ fn run_compiler() -> Result<()> {
fn main() {
let test = include_str!("../simple.html").to_string();
// let test = " <as> </as> ".to_string();
let r = parser::parse_html(&test);
let r = parser::parse_html(&test).unwrap();
for l in r {
println!("{l:?}");
}

View file

@ -1,8 +1,48 @@
use std::collections::HashMap;
use crate::trace::*;
/// Failure modes reported by the HTML lexer and its open/close tag validation.
#[derive(Clone, Debug)]
pub enum ParseError {
/// A tag started but could not be completed: its name was missing/invalid
/// or the closing `>` was not found.
InvalidTag,
/// A closing tag was found whose name differs from the most recently
/// opened, still-unclosed tag.
MismatchedClosing { expected: String, found: String },
/// End of input was reached while the named tag was still open.
UnmatchedOpen(String),
/// A closing tag was found while no tag was open.
UnmatchedClose(String),
/// A closing tag was found for an element listed in `VOID_ELEMENTS`.
VoidClosingTag(String),
/// The input did not match what a sub-parser expected at all. Sub-parsers
/// return this as a generic "not mine" signal, and it is the only variant
/// that `or_keep_error` lets a later parser's error replace.
Unknown,
}
use crate::trace::{self, WithContext};
impl From<ParseError> for trace::Error {
fn from(value: ParseError) -> Self {
let msg = match value {
ParseError::InvalidTag => "Failed to parse a tag".into(),
ParseError::MismatchedClosing { expected, found } => {
format!(
"Found closing tag '{}' where '{}' was expected",
found, expected
)
},
ParseError::UnmatchedOpen(s) => {
format!("The tag '{}' is opened, but never closed", s)
},
ParseError::UnmatchedClose(s) => {
format!("The tag '{}' is closed, but never opened", s)
},
ParseError::VoidClosingTag(s) => {
format!("The tag '{}' should not have a closing tag", s)
},
ParseError::Unknown => {
return trace::Error::new(
trace::ErrorKind::Unknown,
"Unknown error while parsing",
)
},
};
trace::Error::new(trace::ErrorKind::Parsing, msg)
}
}
#[derive(Clone, Debug, PartialEq)]
pub enum Lexeme<'a> {
OpenTag {
name: &'a str,
@ -12,33 +52,82 @@ pub enum Lexeme<'a> {
CloseTag {
name: &'a str,
},
Content(&'a str),
Text(&'a str),
Doctype,
Comment,
}
fn normalize_whitespace(s: &str) {
fn normalize_whitespace(mut tail: &str) -> String {
// https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace
todo!()
}
fn error(message: impl Into<String>) -> Error {
Error {
kind: ErrorKind::Parsing,
reason: message.into(),
backtrace: vec![],
let mut _index = 0;
let mut buffer = String::with_capacity(tail.len());
while !tail.is_empty() {
match parse_whitespace_min(tail, 1, &mut _index) {
Some((_, new_tail)) => {
buffer.push(' ');
tail = new_tail;
},
None => {},
}
let (chars, new_tail) =
parse_while(tail, |c| !c.is_whitespace(), &mut _index);
buffer.push_str(chars);
tail = new_tail
}
buffer
}
/// Try parsing single specific character
fn parse_char(i: &str, c: char) -> Option<(&str, &str)> {
if i.starts_with(c) {
Some((&i[0..1], &i[1..]))
/// Try parsing a single specific character, ignoring ASCII case.
///
/// Returns `Some((matched, rest))` when the first character of `tail` equals
/// `c` (ASCII-case-insensitively) and advances `*index` by the matched
/// character's UTF-8 byte length. Returns `None`, leaving `*index` untouched,
/// when `tail` is empty or starts with a different character.
fn parse_char<'a>(
    tail: &'a str,
    c: char,
    index: &mut usize,
) -> Option<(&'a str, &'a str)> {
    // Inspect the first *character* rather than the first byte: slicing
    // `tail[0..1]` panics when the input starts with a multi-byte character.
    let first = tail.chars().next()?;
    if !first.eq_ignore_ascii_case(&c) {
        return None;
    }
    let width = first.len_utf8();
    *index += width;
    Some(tail.split_at(width))
}
// Parse until condition is not true for next character
fn parse_while(tail: &str, condition: impl Fn(char) -> bool) -> (&str, &str) {
/// Try parsing the literal `to_match` at the start of `tail`, ignoring ASCII
/// case.
///
/// Returns `Some((matched, rest))` and advances `*index` by
/// `to_match.len()` bytes on success. Returns `None`, leaving `*index`
/// untouched, when `tail` is too short or does not start with `to_match`.
fn parse_str<'a>(
    tail: &'a str,
    to_match: &'a str,
    index: &mut usize,
) -> Option<(&'a str, &'a str)> {
    let len = to_match.len();
    // `get` yields `None` both when `tail` is shorter than `to_match` and when
    // `len` falls inside a multi-byte character (where direct slicing would
    // panic); neither case can be a match.
    let head = tail.get(..len)?;
    if !head.eq_ignore_ascii_case(to_match) {
        return None;
    }
    *index += len;
    Some((head, &tail[len..]))
}
/// Consume input up to and including the first occurrence of `to_match`.
///
/// Returns `Some((consumed, rest))` where `consumed` ends with `to_match`, and
/// advances `*index` by `consumed.len()` bytes. Returns `None`, leaving
/// `*index` untouched, when `to_match` does not occur in `tail`.
fn parse_until_str<'a>(
    tail: &'a str,
    to_match: &'a str,
    index: &mut usize,
) -> Option<(&'a str, &'a str)> {
    // `str::find` handles a terminator at the very end of the input, never
    // lands inside a multi-byte character, and avoids re-scanning every
    // prefix of `tail`.
    let end = tail.find(to_match)? + to_match.len();
    *index += end;
    Some(tail.split_at(end))
}
/// Parse until condition is not true for next character
fn parse_while<'a>(
tail: &'a str,
condition: impl Fn(char) -> bool,
index: &mut usize,
) -> (&'a str, &'a str) {
let mut end;
let mut it = tail.char_indices();
'outer: loop {
@ -55,105 +144,166 @@ fn parse_while(tail: &str, condition: impl Fn(char) -> bool) -> (&str, &str) {
},
};
}
*index += end;
(&tail[0..end], &tail[end..])
}
fn parse_whitespace(i: &str) -> (&str, &str) {
parse_while(i, |c| c.is_whitespace())
/// Consume any leading whitespace from `i`.
///
/// Delegates to `parse_while` with `char::is_whitespace` as the predicate and
/// forwards `index` to it, returning `parse_while`'s `(matched, rest)` pair.
/// Unlike the `Option`-returning parsers, this never fails.
fn parse_whitespace<'a>(i: &'a str, index: &mut usize) -> (&'a str, &'a str) {
parse_while(i, |c| c.is_whitespace(), index)
}
fn parse_doctype(tail: &str) -> Option<(&str, &str)> {
const doctype_str = "<!DOCTYPE>"
/// Consume leading whitespace, requiring at least `min` bytes of it.
///
/// Returns `Some((whitespace, rest))` and advances `*index` when enough
/// whitespace is present; otherwise returns `None` and leaves `*index`
/// untouched.
fn parse_whitespace_min<'a>(
    tail: &'a str,
    min: usize,
    index: &mut usize,
) -> Option<(&'a str, &'a str)> {
    // Count into a scratch offset so a failed match does not move `index`.
    let mut consumed = 0;
    let (spaces, rest) = parse_whitespace(tail, &mut consumed);
    if spaces.len() >= min {
        *index += consumed;
        Some((spaces, rest))
    } else {
        None
    }
}
/// Try parsing all characters between two delimiter
/// characters
fn parse_delimited(i: &str, delimiter: char) -> Option<(&str, &str)> {
let (_, tail) = parse_char(i, delimiter)?;
let (value, tail) = parse_while(tail, |c| c != delimiter);
let (_, tail) = parse_char(tail, delimiter)?;
fn parse_delimited<'a>(
i: &'a str,
delimiter: char,
index: &mut usize,
) -> Option<(&'a str, &'a str)> {
let mut new_index = 0;
let (_, tail) = parse_char(i, delimiter, &mut new_index)?;
let (value, tail) = parse_while(tail, |c| c != delimiter, &mut new_index);
let (_, tail) = parse_char(tail, delimiter, &mut new_index)?;
*index += new_index;
Some((value, tail))
}
fn parse_tag_name(i: &str) -> Option<(&str, &str)> {
let (value, tail) = parse_while(i, |c| c.is_ascii_alphanumeric() || c == ':');
/// Parse a tag name: a non-empty run of ASCII alphanumerics, `:`, `_` or `-`.
///
/// Returns `Some((name, rest))` and advances `*index` on success; returns
/// `None` without touching `*index` when no name character is present.
fn parse_tag_name<'a>(
    i: &'a str,
    index: &mut usize,
) -> Option<(&'a str, &'a str)> {
    let is_name_char =
        |c: char| c.is_ascii_alphanumeric() || matches!(c, ':' | '_' | '-');

    // Count into a scratch offset so a failed match does not move `index`.
    let mut consumed = 0;
    let (name, rest) = parse_while(i, is_name_char, &mut consumed);
    if name.is_empty() {
        return None;
    }
    *index += consumed;
    Some((name, rest))
}
fn parse_attribute_key(i: &str) -> Option<(&str, &str)> {
let (value, tail) = parse_while(i, |c| {
!(['"', '\'', '>', '/', '='].contains(&c) || c.is_control())
});
/// Parse an attribute key: a non-empty run of characters that are not a
/// quote, `>`, `/`, `=`, a control character, or whitespace.
///
/// Returns `Some((key, rest))` and advances `*index` on success; returns
/// `None` without touching `*index` when the key would be empty.
fn parse_attribute_key<'a>(
    i: &'a str,
    index: &mut usize,
) -> Option<(&'a str, &'a str)> {
    let is_key_char = |c: char| {
        !matches!(c, '"' | '\'' | '>' | '/' | '=')
            && !c.is_control()
            && !c.is_whitespace()
    };

    // Count into a scratch offset so a failed match does not move `index`.
    let mut consumed = 0;
    let (key, rest) = parse_while(i, is_key_char, &mut consumed);
    if key.is_empty() {
        return None;
    }
    *index += consumed;
    Some((key, rest))
}
fn parse_attribute_val(i: &str) -> Option<(&str, &str)> {
fn parse_attribute_val<'a>(
i: &'a str,
index: &mut usize,
) -> Option<(&'a str, &'a str)> {
const SINGLE_QUOTE: char = '\'';
const DOUBLE_QUOTE: char = '"';
let (value, tail) = parse_delimited(i, '\'') // Single quote delimit
.or_else(|| parse_delimited(i, '"')) // Double quote delimit
let mut new_index = 0;
let (value, tail) =
parse_delimited(i, SINGLE_QUOTE, &mut new_index) // Single quote delimit
.or_else(|| parse_delimited(i, DOUBLE_QUOTE, &mut new_index)) // Double quote delimit
.or_else(|| { // Unquoted
Some(parse_while(i, |c| {
!(c.is_whitespace()
|| [SINGLE_QUOTE, DOUBLE_QUOTE, '=', '<', '>', '`'].contains(&c))
}))
}, &mut new_index))
})?;
if value.is_empty() {
None
} else {
Some((value, tail))
}
*index += new_index;
Some((value, tail))
}
/// Returns Option<((key, value), tail)>
fn parse_key_val(tail: &str) -> Option<((&str, Option<&str>), &str)> {
fn parse_key_val<'a>(
tail: &'a str,
index: &mut usize,
) -> Option<((&'a str, Option<&'a str>), &'a str)> {
let mut new_index = 0;
// Require whitespace
let (ws, tail) = parse_whitespace(tail);
if ws.is_empty() {
return None;
}
let (_, tail) = parse_whitespace_min(tail, 1, &mut new_index)?;
// Fail when no key found
let (key, tail) = parse_attribute_key(tail)?;
let (_, tail) = parse_whitespace(tail);
if let Some((_, tail)) = parse_char(tail, '=') {
let (_, tail) = parse_whitespace(tail);
let (key, tail) = parse_attribute_key(tail, &mut new_index)?;
if let Some((_, tail)) = parse_char(
parse_whitespace(tail, &mut new_index).1,
'=',
&mut new_index,
) {
let (_, tail) = parse_whitespace(tail, &mut new_index);
// Fail when = is not followed by value
let (val, tail) = parse_attribute_val(tail)?;
Some(((key, Some(val)), tail))
let (val, tail) = parse_attribute_val(tail, &mut new_index)?;
let val = if val.is_empty() { None } else { Some(val) };
*index += new_index;
Some(((key, val), tail))
} else {
*index += new_index;
Some(((key, None), tail))
}
}
// Names of tags that are implicitly self-closing: the trailing `/>` is
// optional for them. An open tag with one of these names is treated as void,
// and a closing tag with one of these names is rejected during validation.
const VOID_ELEMENTS: [&str; 16] = [
"area", "base", "br", "col", "command", "embed", "hr", "img", "input",
"keygen", "link", "meta", "param", "source", "track", "wbr",
];
fn parse_open_tag(tail: &str) -> Option<(Lexeme, &str)> {
fn parse_open_tag<'a>(
tail: &'a str,
index: &mut usize,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
let mut new_index = 0;
// <
let (_, tail) = parse_char(tail, '<')?;
let (_, tail) =
parse_char(tail, '<', &mut new_index).ok_or(ParseError::Unknown)?;
// tag name
let (name, mut tail) = parse_tag_name(tail)?;
let (name, mut tail) =
parse_tag_name(tail, &mut new_index).ok_or(ParseError::InvalidTag)?;
// attributes
let mut attributes: HashMap<&str, Option<&str>> = HashMap::new();
while let Some((kv, new_tail)) = parse_key_val(tail) {
while let Some((kv, new_tail)) = parse_key_val(tail, &mut new_index) {
attributes.insert(kv.0, kv.1);
tail = new_tail;
}
let (_, tail) = parse_whitespace(tail);
let (is_void, tail) = parse_char(tail, '/').unwrap_or(("", tail));
let (_, tail) = parse_whitespace(tail, &mut new_index);
let (is_void, tail) =
parse_char(tail, '/', &mut new_index).unwrap_or(("", tail));
let is_void = !is_void.is_empty() || VOID_ELEMENTS.contains(&name);
let (_, tail) = parse_char(tail, '>')?;
Some((
let (_, tail) = match parse_char(tail, '>', &mut new_index) {
Some(v) => v,
None => {
return Err(ParseError::InvalidTag);
},
};
*index += new_index;
Ok((
Lexeme::OpenTag {
name,
attributes,
@ -163,37 +313,162 @@ fn parse_open_tag(tail: &str) -> Option<(Lexeme, &str)> {
))
}
fn parse_close_tag(tail: &str) -> Option<(Lexeme, &str)> {
let (_, tail) = parse_char(tail, '<')?;
let (_, tail) = parse_char(tail, '/')?;
let (name, tail) = parse_tag_name(tail)?;
let (_, tail) = parse_whitespace(tail);
let (_, tail) = parse_char(tail, '>')?;
Some((Lexeme::CloseTag { name }, tail))
/// Parse a closing tag of the form `</name>` (whitespace allowed before `>`).
///
/// Returns the `CloseTag` lexeme and the remaining input, advancing `*index`
/// only on success.
///
/// # Errors
/// * `ParseError::Unknown` when the input does not start with `</`.
/// * `ParseError::InvalidTag` when the name or the closing `>` is missing.
fn parse_close_tag<'a>(
    tail: &'a str,
    index: &mut usize,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
    // Count into a scratch offset so a failed match does not move `index`.
    let mut consumed = 0;

    let after_lt = parse_char(tail, '<', &mut consumed)
        .map(|(_, rest)| rest)
        .ok_or(ParseError::Unknown)?;
    let after_slash = parse_char(after_lt, '/', &mut consumed)
        .map(|(_, rest)| rest)
        .ok_or(ParseError::Unknown)?;

    // Past `</` this is definitely a closing tag, so failures are reported
    // as an invalid tag rather than as "not a closing tag".
    let (name, after_name) = parse_tag_name(after_slash, &mut consumed)
        .ok_or(ParseError::InvalidTag)?;
    let (_, before_gt) = parse_whitespace(after_name, &mut consumed);
    let rest = parse_char(before_gt, '>', &mut consumed)
        .map(|(_, rest)| rest)
        .ok_or(ParseError::InvalidTag)?;

    *index += consumed;
    Ok((Lexeme::CloseTag { name }, rest))
}
fn parse_text(tail: &str) -> Option<(Lexeme, &str)> {
let (txt, tail) = parse_while(tail, |c| c != '<');
/// Parse an HTML doctype declaration: `<!doctype`, at least one whitespace
/// character, `html`, optional whitespace, then `>` (ASCII-case-insensitive).
///
/// Returns the `Doctype` lexeme and the remaining input, advancing `*index`
/// only on success.
///
/// # Errors
/// `ParseError::Unknown` when the input is not a doctype declaration.
fn parse_doctype<'a>(
    tail: &'a str,
    index: &mut usize,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
    /// Runs the doctype grammar, returning what follows the closing `>`.
    fn after_doctype<'s>(input: &'s str, consumed: &mut usize) -> Option<&'s str> {
        let (_, rest) = parse_str(input, "<!doctype", consumed)?;
        let (_, rest) = parse_whitespace_min(rest, 1, consumed)?;
        let (_, rest) = parse_str(rest, "html", consumed)?;
        let (_, rest) = parse_whitespace(rest, consumed);
        parse_char(rest, '>', consumed).map(|(_, rest)| rest)
    }

    // Count into a scratch offset so a failed match does not move `index`.
    let mut consumed = 0;
    let rest = after_doctype(tail, &mut consumed).ok_or(ParseError::Unknown)?;
    *index += consumed;
    Ok((Lexeme::Doctype, rest))
}
/// Parse an HTML comment: `<!--` followed by everything up to and including
/// the next `-->`. The comment text itself is discarded.
///
/// Returns the `Comment` lexeme and the remaining input, advancing `*index`
/// only on success.
///
/// # Errors
/// `ParseError::Unknown` when the input does not start with `<!--` or the
/// terminator is not found.
fn parse_comment<'a>(
    tail: &'a str,
    index: &mut usize,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
    // Count into a scratch offset so a failed match does not move `index`.
    let mut consumed = 0;
    let (_, rest) = parse_str(tail, "<!--", &mut consumed)
        .and_then(|(_, body)| parse_until_str(body, "-->", &mut consumed))
        .ok_or(ParseError::Unknown)?;
    *index += consumed;
    Ok((Lexeme::Comment, rest))
}
fn parse_text<'a>(
tail: &'a str,
index: &mut usize,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
let mut new_index = 0;
let (txt, tail) = parse_while(tail, |c| c != '<', &mut new_index);
if txt.is_empty() {
None
Err(ParseError::Unknown)
} else {
Some((Lexeme::Content(txt), tail))
*index += new_index;
Ok((Lexeme::Text(txt), tail))
}
}
pub fn parse_html(mut tail: &str) -> Vec<Lexeme> {
let mut stack = vec![];
/// Like `Result::or_else`, but prefers the more specific of the two errors.
///
/// If `r` is `Ok` it is returned unchanged and `op` is not called. Otherwise
/// `op` is tried; when it also fails, the first error is kept unless it was
/// `ParseError::Unknown`, in which case the second error is returned.
fn or_keep_error<'a>(
    r: Result<(Lexeme<'a>, &'a str), ParseError>,
    op: impl FnOnce() -> Result<(Lexeme<'a>, &'a str), ParseError>,
) -> Result<(Lexeme<'a>, &'a str), ParseError> {
    r.or_else(|first| {
        op().map_err(|second| {
            if matches!(first, ParseError::Unknown) {
                second
            } else {
                first
            }
        })
    })
}
/// Convert a byte offset into `input` to a 1-based `(row, column)` pair.
///
/// Rows advance on `'\n'`; columns count characters (not bytes) since the
/// last newline. This is used to annotate errors, so it never panics: an
/// `index` past the end of `input` is clamped to the end, and an `index`
/// inside a multi-byte character is rounded down to the start of that
/// character.
fn index_to_rc(input: &str, index: usize) -> (usize, usize) {
    let mut end = index.min(input.len());
    // Offset 0 is always a char boundary, so this loop terminates.
    while !input.is_char_boundary(end) {
        end -= 1;
    }
    input[..end].chars().fold((1, 1), |(row, col), c| {
        if c == '\n' {
            (row + 1, 1)
        } else {
            (row, col + 1)
        }
    })
}
pub fn parse_html(input: &str) -> trace::Result<Vec<Lexeme>> {
let mut tail = input;
let mut lexeme_stack = vec![];
let mut validation_stack = vec![];
let mut index = 0;
let err = |error: ParseError, index: usize| -> trace::Result<Vec<Lexeme>> {
let (row, col) = index_to_rc(input, index);
let e: trace::Error = error.into();
Err(e).ctx(format!("Starting at line {} character {}", row, col))
};
while !tail.is_empty() {
let (_, new_tail) = parse_whitespace(tail);
let (_, new_tail) = parse_whitespace(tail, &mut index);
if new_tail.is_empty() {
break;
}
let (lm, new_tail) = parse_open_tag(new_tail)
.or_else(|| parse_close_tag(new_tail))
.or_else(|| parse_text(new_tail))
.unwrap();
stack.push(lm);
let result = or_keep_error(parse_open_tag(new_tail, &mut index), || {
parse_close_tag(new_tail, &mut index)
});
let result = or_keep_error(result, || parse_text(new_tail, &mut index));
let result = or_keep_error(result, || parse_comment(new_tail, &mut index));
let (lm, new_tail) =
match or_keep_error(result, || parse_doctype(new_tail, &mut index)) {
Ok(v) => v,
Err(e) => {
return err(e, index);
},
};
// Validate that open and close tags match
match lm {
Lexeme::OpenTag { name, is_void, .. } => {
if !is_void {
validation_stack.push(name);
}
},
Lexeme::CloseTag { name } => {
if VOID_ELEMENTS.contains(&name) {
return err(ParseError::VoidClosingTag(name.into()).into(), index);
}
if let Some(top) = validation_stack.pop() {
if name != top {
return err(
ParseError::MismatchedClosing {
expected: top.into(),
found: name.into(),
},
index,
);
}
} else {
return err(ParseError::UnmatchedClose(name.into()), index);
}
},
Lexeme::Comment => {
tail = new_tail;
continue;
},
_ => {},
};
lexeme_stack.push(lm);
tail = new_tail;
}
stack
if let Some(top) = validation_stack.pop() {
let e: trace::Error = ParseError::UnmatchedOpen(top.into()).into();
return Err(e).ctx("At end of file");
}
Ok(lexeme_stack)
}

View file

@ -44,15 +44,26 @@ pub enum ErrorKind {
Unknown,
}
impl Display for Error {
impl Display for ErrorKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{:?} error\nReason:\n\t{}\nBacktrace:\n",
self.kind, self.reason
)?;
"{}",
match self {
ErrorKind::IO => "IO",
ErrorKind::Parsing => "PARSING",
ErrorKind::Compilation => "COMPILATION",
ErrorKind::Unknown => "UNKNOWN",
}
)
}
}
impl Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "\n[{} ERROR] {}\nBacktrace:\n", self.kind, self.reason)?;
for s in self.backtrace.iter().rev() {
write!(f, "\t{}\n", s)?;
write!(f, "{}\n", s)?;
}
Ok(())
}
@ -129,9 +140,10 @@ where
S: Into<String>,
{
fn ctx(self, s: S) -> Result<T> {
match self {
Some(v) => Ok(v),
None => Err(Error::new(ErrorKind::Unknown, "Missing expected value")),
}
self.ok_or_else(|| Error {
kind: ErrorKind::Unknown,
reason: "Missing expected value".into(),
backtrace: vec![s.into()],
})
}
}

View file

@ -7,8 +7,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="test_files/main-ad0a5132b4027392.css">
<link rel="preload" href="https://logan-gatlin.com/web-2abb0afbb41a1a01_bg.wasm" as="fetch" type="application/wasm"
crossorigin="">
<link rel="preload" href="https://logan-gatlin.com/web-2abb0afbb41a1a01_bg.wasm" as="fetch" type="application/wasm" crossorigin="">
<link rel="modulepreload" href="https://logan-gatlin.com/web-2abb0afbb41a1a01.js">
<link
href="data:text/css,%3Ais(%5Bid*%3D'google_ads_iframe'%5D%2C%5Bid*%3D'taboola-'%5D%2C.taboolaHeight%2C.taboola-placeholder%2C%23credential_picker_container%2C%23credentials-picker-container%2C%23credential_picker_iframe%2C%5Bid*%3D'google-one-tap-iframe'%5D%2C%23google-one-tap-popup-container%2C.google-one-tap-modal-div%2C%23amp_floatingAdDiv%2C%23ez-content-blocker-container)%20%7Bdisplay%3Anone!important%3Bmin-height%3A0!important%3Bheight%3A0!important%3B%7D"