diff --git a/Cargo.lock b/Cargo.lock index 41907e0..681eca9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -777,6 +777,18 @@ dependencies = [ "syn", ] +[[package]] +name = "regex" +version = "1.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + [[package]] name = "regex-automata" version = "0.4.13" @@ -1419,6 +1431,7 @@ dependencies = [ "minijinja-contrib", "mio", "pyo3", + "regex", "serde", "serde_json", "thiserror 2.0.17", diff --git a/Cargo.toml b/Cargo.toml index 2359f65..aec8ce1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,7 @@ minijinja = "2.12" minijinja-contrib = "2.12" notify = "8.2" percent-encoding = "2.3" +regex = "1.12" sha1_smol = "1.0" slab = "0.4" serde = "1.0" diff --git a/crates/zensical/Cargo.toml b/crates/zensical/Cargo.toml index 8daafc3..b712e55 100644 --- a/crates/zensical/Cargo.toml +++ b/crates/zensical/Cargo.toml @@ -54,6 +54,7 @@ minijinja = { workspace = true, features = [ minijinja-contrib = { workspace = true, features = ["html_entities"] } mio = { workspace = true, features = ["net", "os-poll"] } pyo3.workspace = true +regex.workspace = true serde = { workspace = true, features = ["derive", "rc"] } serde_json.workspace = true thiserror.workspace = true diff --git a/crates/zensical/src/structure/markdown.rs b/crates/zensical/src/structure/markdown.rs index 77dfd24..526539d 100644 --- a/crates/zensical/src/structure/markdown.rs +++ b/crates/zensical/src/structure/markdown.rs @@ -39,6 +39,10 @@ use crate::structure::nav::to_title; use crate::structure::search::SearchItem; use crate::structure::toc::Section; +mod autorefs; + +pub use autorefs::Autorefs; + // ---------------------------------------------------------------------------- // Structs // ---------------------------------------------------------------------------- @@ -57,6 +61,8 @@ pub 
struct Markdown { pub title: String, /// Table of contents. pub toc: Vec<Section>
, + /// Autorefs (mkdocstrings). + pub autorefs: Option<Autorefs>, } // ---------------------------------------------------------------------------- @@ -66,12 +72,14 @@ pub struct Markdown { impl Markdown { /// Renders Markdown using Python Markdown. #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))] - pub fn new(id: &Id, content: String) -> impl IntoReport { + pub fn new( + id: &Id, url: String, content: String, + ) -> impl IntoReport { let id = id.clone(); Python::attach(|py| { let module = py.import("zensical.markdown")?; module - .call_method1("render", (content, id.location()))? + .call_method1("render", (content, id.location(), url))? .extract::() }) .map_err(|err: PyErr| Error::from(Box::new(err) as Box<_>)) @@ -81,6 +89,7 @@ impl Markdown { content: markdown.content, search: markdown.search, toc: markdown.toc, + autorefs: markdown.autorefs, }) } } diff --git a/crates/zensical/src/structure/markdown/autorefs.rs b/crates/zensical/src/structure/markdown/autorefs.rs new file mode 100644 index 0000000..634115f --- /dev/null +++ b/crates/zensical/src/structure/markdown/autorefs.rs @@ -0,0 +1,630 @@ +// Copyright (c) 2025 Zensical and contributors + +// SPDX-License-Identifier: MIT +// Third-party contributions licensed under DCO + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software.
+ + // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + // FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE + // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + // IN THE SOFTWARE. + + // ---------------------------------------------------------------------------- + + //! Autorefs (mkdocstrings). + + use ahash::HashMap; + use pyo3::FromPyObject; + use regex::{Captures, Regex}; + use serde::{Deserialize, Serialize}; + use std::path::Path; + use std::string::ToString; + use std::sync::LazyLock; + use zrx::path::PathExt; + + // ---------------------------------------------------------------------------- + // Constants + // ---------------------------------------------------------------------------- + + /// Autoref regex. + static AUTOREF_RE: LazyLock<Regex> = LazyLock::new(|| { + Regex::new(r"<autoref (?P<attrs>.*?)>(?P<title>.*?)</autoref>").unwrap() + }); + + /// Handled autoref attributes that should not be passed through to the output link. + const HANDLED_ATTRS: &[&str] = &[ + "identifier", + "optional", + "hover", + "class", + "domain", + "role", + "origin", + "filepath", + "lineno", + "slug", + "backlink-type", + "backlink-anchor", + ]; + + // ---------------------------------------------------------------------------- + // Helper Functions + // ---------------------------------------------------------------------------- + + /// Escapes HTML special characters. + fn html_escape(text: &str) -> String { + text.replace('&', "&amp;") + .replace('<', "&lt;") + .replace('>', "&gt;") + .replace('"', "&quot;") + .replace('\'', "&#39;") + } + + /// Helper to check if a URL is relative to a base URL.
+fn is_relative_to(url: &str, base: &str) -> bool { + // Remove fragments and query strings for directory comparison + let url_path = url + .split('#') + .next() + .unwrap_or(url) + .split('?') + .next() + .unwrap_or(url); + let base_path = base + .split('#') + .next() + .unwrap_or(base) + .split('?') + .next() + .unwrap_or(base); + + // Use Path::starts_with for proper path comparison + Path::new(url_path).starts_with(Path::new(base_path)) +} + +/// Gets the parent path of a URL. +fn parent_path(url: &str) -> Option<String> { + Path::new(url) + .parent() + .and_then(|p| p.to_str()) + .map(ToString::to_string) +} + +/// Resolves the closest URL from a list relative to from_url. +/// +/// We do that when multiple URLs are found for an identifier. +/// +/// By closest, we mean a combination of "relative to the current page" and "shortest distance from the current page". +/// +/// For example, if you link to identifier `hello` from page `foo/bar/`, +/// and the identifier is found in `foo/`, `foo/baz/` and `foo/bar/baz/qux/` pages, +/// autorefs will resolve to `foo/bar/baz/qux`, which is the only URL relative to `foo/bar/`. +/// +/// If multiple URLs are equally close, autorefs will resolve to the first of these equally close URLs. +/// If autorefs cannot find any URL that is close to the current page, it will log a warning and resolve to the first URL found. +/// +/// When false and multiple URLs are found for an identifier, autorefs will log a warning and resolve to the first URL. 
+fn resolve_closest_url( + from_url: &str, urls: &[String], _qualifier: &str, +) -> String { + let mut base_url = from_url.to_string(); + let candidates; + + loop { + let found: Vec<String> = urls + .iter() + .filter(|url| is_relative_to(url, &base_url)) + .cloned() + .collect(); + + if !found.is_empty() { + candidates = found; + break; + } + + match parent_path(&base_url) { + Some(parent) if !parent.is_empty() => { + base_url = parent; + } + _ => { + // @todo Log warning using qualifier + return urls[0].clone(); + } + } + } + + if candidates.len() == 1 { + candidates[0].clone() + } else { + // Find the URL with the fewest slashes + candidates + .into_iter() + .min_by_key(|url| url.matches('/').count()) + .unwrap() + } +} + +/// Computes a relative URL from from_url to to_url. +fn relative_url(from_url: &str, to_url: &str) -> String { + let from_path = Path::new(from_url); + + // Split URL and fragment for relative computation + let (to_path, to_fragment) = to_url + .split_once('#') + .map_or((Path::new(to_url), None), |(path, f)| { + (Path::new(path), Some(f)) + }); + + // Make target URL relative to page + let mut rel_path = to_path + .relative_to(from_path) + .to_string_lossy() + .replace('\\', "/"); + + // Add fragment back if present + if let Some(frag) = to_fragment { + // If the relative path is "." and we have a fragment, + // just return the fragment + if rel_path == "." { + return format!("#{frag}"); + } + // If `to_path` was empty (URL was just a fragment), + // add "/" before the fragment + if to_path.as_os_str().is_empty() { + rel_path.push('/'); + } + rel_path.push('#'); + rel_path.push_str(frag); + } + + rel_path +} + +/// Checks if a URL is relative (no scheme). 
+fn is_relative_url(url: &str) -> bool { + !(url.starts_with("http://") || url.starts_with("https://")) + } + + // ---------------------------------------------------------------------------- + // Structs + // ---------------------------------------------------------------------------- + + /// Autorefs (mkdocstrings). + /// + /// We use three URL maps, one for "primary" URLs, one for "secondary" URLs, + /// and one for "absolute" URLs. + /// + /// - A primary URL is an identifier that links to a specific anchor on a page. + /// - A secondary URL is an alias of an identifier that links to the same anchor as the identifier's primary URL. + /// Primary URLs with these aliases as identifiers may or may not be rendered later. + /// - An absolute URL is an identifier that links to an external resource. + /// These URLs are typically registered by mkdocstrings when loading object inventories. + /// + /// mkdocstrings registers a primary URL for each heading rendered in a page. + /// Then, for each alias of this heading's identifier, it registers a secondary URL. + /// + /// For example: + /// + /// - Object `a.b.c.d` has aliases `a.b.d` and `a.d` + /// - Object `a.b.c.d` is rendered. + /// - We register `a.b.c.d` -> page#a.b.c.d as primary + /// - We register `a.b.d` -> page#a.b.c.d as secondary + /// - We register `a.d` -> page#a.b.c.d as secondary + /// - Later, if `a.b.d` or `a.d` are rendered, we will register primary and secondary URLs the same way + /// - This way we are sure that each of `a.b.c.d`, `a.b.d` or `a.d` will link to their primary URL, if any, or their secondary URL, accordingly + /// + /// We need to keep track of whether an identifier is primary or secondary, + /// to give it precedence when resolving cross-references. + /// We wouldn't want to log a warning if there is a single primary URL and one or more secondary URLs, + /// instead we want to use the primary URL without any warning. + /// + /// - A single primary URL mapped to an identifier? Use it.
+/// - Multiple primary URLs mapped to an identifier? Use the first one, or closest one if configured as such. +/// - No primary URL mapped to an identifier, but a secondary URL mapped? Use it. +/// - Multiple secondary URLs mapped to an identifier? Use the first one, or closest one if configured as such. +/// - No secondary URL mapped to an identifier? Try using absolute URLs +/// (typically registered by loading inventories in mkdocstrings). +#[derive( + Clone, Debug, Default, FromPyObject, Serialize, Deserialize, PartialEq, Eq, +)] +#[pyo3(from_item_all)] +pub struct Autorefs { + // Primary URLs. + pub primary: HashMap<String, Vec<String>>, + // Secondary URLs. + pub secondary: HashMap<String, Vec<String>>, + // Inventory URLs. + pub inventory: HashMap<String, String>, + // Titles. + pub titles: HashMap<String, String>, +} + +// ---------------------------------------------------------------------------- +// Implementations +// ---------------------------------------------------------------------------- + +impl Autorefs { + /// Creates a new, empty autorefs. + pub fn new() -> Self { + Self::default() + } + + /// Parses HTML attributes string into a HashMap. 
+ /// + /// @todo Document that this is not the most resilient HTML parser + /// but since we control the autorefs elements, it's fine for now + fn parse_attributes(attrs_str: &str) -> HashMap<String, String> { + let mut attrs = HashMap::default(); + let mut chars = attrs_str.chars().peekable(); + + while let Some(ch) = chars.peek() { + // Skip whitespace + if ch.is_whitespace() { + chars.next(); + continue; + } + + // Parse attribute name + let mut name = String::new(); + while let Some(&ch) = chars.peek() { + if ch.is_whitespace() || ch == '=' { + break; + } + name.push(ch); + chars.next(); + } + + if name.is_empty() { + break; + } + + // Skip whitespace + while let Some(&ch) = chars.peek() { + if !ch.is_whitespace() { + break; + } + chars.next(); + } + + // Check for '=' + let has_value = chars.peek() == Some(&'='); + if has_value { + chars.next(); // consume '=' + + // Skip whitespace after '=' + while let Some(&ch) = chars.peek() { + if !ch.is_whitespace() { + break; + } + chars.next(); + } + + // Parse value + let value = if let Some(&quote) = chars.peek() { + if quote == '"' || quote == '\'' { + chars.next(); // consume opening quote + let mut val = String::new(); + for ch in chars.by_ref() { + if ch == quote { + break; // consume closing quote + } + val.push(ch); + } + val + } else { + // Unquoted value + let mut val = String::new(); + while let Some(&ch) = chars.peek() { + if ch.is_whitespace() { + break; + } + val.push(ch); + chars.next(); + } + val + } + } else { + String::new() + }; + + attrs.insert(name, value); + } else { + // Boolean attribute + attrs.insert(name, String::new()); + } + } + + attrs + } + + /// Resolves the URL for an item identifier (internal implementation). + fn get_url_from_id( + &self, identifier: &str, from_url: &str, resolve_closest: bool, + ) -> Result<String, String> { + // Try primary URLs first - usually, an object should not have multiple + // primary URLs, but if it does, resolve closest if requested.
Primary + // URLs are the canonical locations objects are defined. If an object + // is re-exported, it should have a secondary URL instead. + if let Some(urls) = self.primary.get(identifier) { + if urls.len() > 1 && resolve_closest { + return Ok(resolve_closest_url(from_url, urls, "primary")); + // @todo Log warning about multiple URLs in production + } + return Ok(urls[0].clone()); + } + + // Try secondary URLs + if let Some(urls) = self.secondary.get(identifier) { + if urls.len() > 1 { + // Always resolve closest for secondary + // + // Downstream projects rendering aliases of objects + // imported from upstream ones will render these upstream + // objects' docstrings. These docstrings can contain + // cross-references to other upstream objects that are not + // rendered directly in downstream project's docs. + // + // If downstream project renders subclasses of upstream + // class, with inherited members, only primary URLs will be + // registered for the aliased/downstream identifiers, and + // only secondary URLs will be registered for the upstream + // identifiers. + // + // When trying to apply the cross-reference + // for the upstream docstring, autorefs will find only + // secondary URLs, and multiple ones. But the end user does + // not have control over this. It means we shouldn't log + // warnings when multiple secondary URLs are found, and + // always resolve to closest. + return Ok(resolve_closest_url(from_url, urls, "secondary")); + } + return Ok(urls[0].clone()); + } + + // Try inventory (absolute URLs) + if let Some(url) = self.inventory.get(identifier) { + return Ok(url.clone()); + } + + Err(format!("Identifier '{identifier}' not found")) + } + + /// Gets the URL for an item identifier. 
+ fn get_url_and_title_from_id( + &self, identifier: &str, from_url: &str, + ) -> Result<(String, Option<String>), String> { + let mut url = self.get_url_from_id(identifier, from_url, true)?; + + // Get title using URL as key (not identifier) + let title = self.titles.get(&url).cloned(); + + // If from_url is provided and URL is relative, compute relative URL + if is_relative_url(&url) { + url = relative_url(from_url, &url); + } + + Ok((url, title)) + } + + /// Resolves the URL for the first matching identifier. + fn get_url_and_title_from_ids( + &self, identifiers: &[String], from_url: &str, + ) -> Result<(String, Option<String>), String> { + for identifier in identifiers { + if let Ok(result) = + self.get_url_and_title_from_id(identifier, from_url) + { + return Ok(result); + } + } + Err(format!( + "None of the identifiers {identifiers:?} were found", + )) + } + + /// Extends autorefs with another instance. + pub fn extend(&mut self, other: Autorefs) { + for (key, values) in other.primary { + self.primary.entry(key).or_default().extend(values); + } + for (key, values) in other.secondary { + self.secondary.entry(key).or_default().extend(values); + } + for (key, value) in other.inventory { + self.inventory.insert(key, value); + } + for (key, value) in other.titles { + self.titles.insert(key, value); + } + } + + /// Replaces autorefs in the given content. 
+ pub fn replace_in(&self, content: String, from_url: &str) -> String { + let output = AUTOREF_RE.replace_all(&content, |captures: &Captures| { + let attrs_str = + captures.name("attrs").map_or("", |m| m.as_str()); + let title = + captures.name("title").map_or("", |m| m.as_str()); + + // Parse the HTML attributes + let attrs = Self::parse_attributes(attrs_str); + let identifier = + attrs.get("identifier").cloned().unwrap_or_default(); + let slug = attrs.get("slug").cloned().unwrap_or_default(); + let optional = attrs.contains_key("optional"); + + let identifiers = if slug.is_empty() { + vec![identifier.clone()] + } else { + vec![identifier.clone(), slug.clone()] + }; + + match self.get_url_and_title_from_ids(&identifiers, from_url) { + Ok((url, original_title)) => { + // Check if URL is external (not relative) + let external = !is_relative_url(&url); + + // Build CSS classes + let mut classes = vec![ + "autorefs".to_string(), + if external { + "autorefs-external".to_string() + } else { + "autorefs-internal".to_string() + }, + ]; + + // Add existing classes from attrs + if let Some(class_str) = attrs.get("class") { + classes.extend( + class_str + .split_whitespace() + .map(ToString::to_string), + ); + } + let class_attr = classes.join(" "); + + // Build remaining attributes (those not in the handled set) + let remaining_attrs: Vec<String> = attrs + .iter() + .filter(|(k, _)| !HANDLED_ATTRS.contains(&k.as_str())) + .map(|(k, v)| { + if v.is_empty() { + // Boolean attribute (no value) + k.clone() + } else { + // Attribute with value + format!("{k}=\"{v}\"") + } + }) + .collect(); + + let remaining = if remaining_attrs.is_empty() { + String::new() + } else { + format!(" {}", remaining_attrs.join(" ")) + }; + + // Build title attribute (link_titles is always true, strip_title_tags is always false) + let tooltip = if optional { + // For optional, we use identifier as fallback if no original_title + original_title.as_deref().unwrap_or(&identifier).to_string() + } else { + 
// For non-optional, use original_title or empty + original_title.as_deref().unwrap_or("").to_string() + }; + + let title_attr = if !tooltip.is_empty() && !format!("<code>{title}</code>").contains(&tooltip) { + format!(" title=\"{}\"", html_escape(&tooltip)) + } else { + String::new() + }; + + let escaped_url = html_escape(&url); + format!( + "<a class=\"{class_attr}\"{title_attr} href=\"{escaped_url}\"{remaining}>{title}</a>" + ) + } + Err(_) => { + if optional { + format!("<span title=\"{identifier}\">{title}</span>") + } else { + // @todo: unmapped.append((identifier, attrs.context)) + if title == identifier { + format!("[{identifier}][]") + } else if title == format!("<code>{identifier}</code>") + && slug.is_empty() + { + format!("[<code>{identifier}</code>][]") + } else { + format!("[{title}][{identifier}]") + } + } + } + } + }); + + output.to_string() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_resolve_closest_url() { + let test_cases = vec![ + ("", vec!["x/#b", "#b"], "#b"), + ("a/b", vec!["x/#e", "a/c/#e", "a/d/#e"], "a/c/#e"), + ("a/b/", vec!["x/#e", "a/d/#e", "a/c/#e"], "a/d/#e"), + ("a/b", vec!["x/#e", "a/c/#e", "a/c/d/#e"], "a/c/#e"), + ("a/b/", vec!["x/#e", "a/c/d/#e", "a/c/#e"], "a/c/#e"), + ( + "a/b/c", + vec!["x/#e", "a/#e", "a/b/#e", "a/b/c/#e", "a/b/c/d/#e"], + "a/b/c/#e", + ), + ( + "a/b/c/", + vec!["x/#e", "a/#e", "a/b/#e", "a/b/c/d/#e", "a/b/c/#e"], + "a/b/c/#e", + ), + ("a", vec!["b/c/#d", "c/#d"], "b/c/#d"), + ("a/", vec!["c/#d", "b/c/#d"], "c/#d"), + ]; + + for (base, urls, expected) in test_cases { + let urls: Vec<String> = + urls.into_iter().map(String::from).collect(); + let result = resolve_closest_url(base, &urls, "test"); + assert_eq!(result, expected, "Failed for base: {base}"); + } + } + + #[test] + fn test_relative_url() { + let test_cases = vec![ + ("a/", "a#b", "#b"), + ("a/", "a/b#c", "b#c"), + ("a/b/", "a/b#c", "#c"), + ("a/b/", "a/c#d", "../c#d"), + ("a/b/", "a#c", "..#c"), + ("a/b/c/", "d#e", 
"../../../d#e"), + ("a/b/", "c/d/#e", "../../c/d/#e"), + ("a/index.html", "a/index.html#b", "#b"), + ("a/index.html", "a/b.html#c", "b.html#c"), + ("a/b.html", "a/b.html#c", "#c"), + ("a/b.html", "a/c.html#d", "c.html#d"), + ("a/b.html", "a/index.html#c", "index.html#c"), + ("a/b/c.html", "d.html#e", "../../d.html#e"), + ("a/b.html", "c/d.html#e", "../c/d.html#e"), + ("a/b/index.html", "a/b/c/d.html#e", "c/d.html#e"), + ("", "#x", "#x"), + ("a/", "#x", "../#x"), + ("a/b.html", "#x", "../#x"), + ("", "a/#x", "a/#x"), + ("", "a/b.html#x", "a/b.html#x"), + ]; + + for (current_url, to_url, expected_href) in test_cases { + let result = relative_url(current_url, to_url); + assert_eq!( + result, expected_href, + "Failed for relative_url('{current_url}', '{to_url}'), expected '{expected_href}' but got '{result}'" + ); + } + } +} diff --git a/crates/zensical/src/structure/nav.rs b/crates/zensical/src/structure/nav.rs index 6a0c78a..464d474 100644 --- a/crates/zensical/src/structure/nav.rs +++ b/crates/zensical/src/structure/nav.rs @@ -34,6 +34,8 @@ use zrx::id::Id; use zrx::scheduler::Value; use zrx::stream::value::Chunk; +use crate::structure::markdown::Autorefs; + use super::page::Page; mod item; @@ -59,6 +61,8 @@ pub struct Navigation { pub items: Vec<NavigationItem>, /// Homepage, if defined. pub homepage: Option<NavigationItem>, + /// Autorefs (mkdocstrings). + pub autorefs: Autorefs, /// Precomputed hash. 
pub hash: u64, } @@ -84,6 +88,14 @@ impl Navigation { }) .collect::<HashMap<_, _>>(); + // Consolidate autorefs from all pages + let mut autorefs = Autorefs::default(); + for page in pages.clone().into_values() { + if let Some(page_autorefs) = page.autorefs { + autorefs.extend(page_autorefs); + } + } + // Since a navigation structure is given, we just need to add titles and // icons where necessary and defined in page metadata let mut stack = vec![&mut items]; @@ -148,7 +160,12 @@ impl Navigation { }; // Return navigation - Self { items, homepage, hash } + Self { + items, + homepage, + autorefs, + hash, + } } /// Returns a copy of the navigation with the active item set based on the @@ -182,6 +199,7 @@ impl Navigation { Self { items, homepage: self.homepage.clone(), + autorefs: self.autorefs.clone(), hash: self.hash, } } @@ -285,6 +303,7 @@ impl From<Chunk<Id, Page>> for Navigation { // There can only be pages, no URLs, since we're auto-populating the // navigation from the files in the docs directory + let mut autorefs = Autorefs::default(); for page in pages { let location = page.id.location(); @@ -337,6 +356,11 @@ impl From<Chunk<Id, Page>> for Navigation { is_index: is_index(&file), active: false, }); + + // Consolidate autorefs + if let Some(value) = page.data.autorefs { + autorefs.extend(value); + } } // Precompute hash @@ -349,6 +373,7 @@ impl From<Chunk<Id, Page>> for Navigation { // Determine homepage and return navigation Self { homepage: items.iter().find(|item| item.is_index).cloned(), + autorefs, items, hash, } diff --git a/crates/zensical/src/structure/page.rs b/crates/zensical/src/structure/page.rs index 8896f1d..781a2dd 100644 --- a/crates/zensical/src/structure/page.rs +++ b/crates/zensical/src/structure/page.rs @@ -35,6 +35,7 @@ use zrx::id::Id; use zrx::scheduler::Value; use crate::config::Config; +use crate::structure::markdown::Autorefs; use crate::template::{Template, GENERATOR}; use super::dynamic::Dynamic; @@ -76,6 +77,8 @@ pub struct Page { 
pub toc: Vec<Section>, /// Search index. pub search: Vec<SearchItem>, + /// Autorefs (mkdocstrings). + pub autorefs: Option<Autorefs>, /// Ancestor pages. pub ancestors: Vec<NavigationItem>, /// Previous page. @@ -183,6 +186,7 @@ impl Page { content: markdown.content, toc: markdown.toc, search: markdown.search, + autorefs: markdown.autorefs, path: path.to_string_lossy().into_owned(), ancestors: Vec::new(), previous_page: None, @@ -212,7 +216,7 @@ impl Page { self.next_page = nav.next_page(self); // Create context and render template - template.render_with_context(context! { + let output = template.render_with_context(context! { generator => GENERATOR, nav => nav, base_url => config.get_base_url(&self.url), @@ -221,7 +225,10 @@ impl Page { config => config.project.clone(), tags => self.tags(), page => self, - }) + })?; + + // Replace autorefs, if any + Ok(nav.autorefs.replace_in(output, &self.url)) } /// Returns the tags of the page. diff --git a/crates/zensical/src/workflow.rs b/crates/zensical/src/workflow.rs index 15f1898..35fa468 100644 --- a/crates/zensical/src/workflow.rs +++ b/crates/zensical/src/workflow.rs @@ -26,7 +26,7 @@ //! 
Workflow definitions use std::hash::{DefaultHasher, Hash, Hasher}; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::str::FromStr; use std::{fs, io}; use zrx::id::{Id, Matcher}; @@ -146,9 +146,50 @@ pub fn process_markdown( .map_concurrency( with_id(move |id: &Id, path: String| { let data = fs::read_to_string(path)?; - cached(&config, id, (config.hash, data), |(_, data)| { - Markdown::new(id, data) - }) + + // Compute URL using same logic as Page::new() + let site_dir = config.project.site_dir.clone(); + let use_directory_urls = config.project.use_directory_urls; + + let builder = id.to_builder().with_context(&site_dir); + let url_id = builder.clone().build().expect("invariant"); + + let mut url_path: PathBuf = + url_id.location().to_string().into(); + let is_index = url_path.ends_with("index.md") + || url_path.ends_with("README.md"); + + if url_path.ends_with("README.md") { + url_path.pop(); + url_path = url_path.join("index.md"); + } + + if !use_directory_urls || is_index { + url_path.set_extension("html"); + } else { + url_path.set_extension(""); + url_path.push("index.html"); + } + + let url_path = url_path.to_string_lossy().into_owned(); + let url_id = builder + .with_location(url_path.replace('\\', "/")) + .build() + .expect("invariant"); + + let url = url_id.as_uri().to_string(); + let url = if use_directory_urls { + url.trim_end_matches("index.html").to_string() + } else { + url + }; + + cached( + &config, + id, + (config.hash, data.clone(), url.clone()), + |(_, data, url)| Markdown::new(id, url, data), + ) .into_report() }), 1, diff --git a/python/zensical/markdown.py b/python/zensical/markdown.py index 4f8fd04..cec7396 100644 --- a/python/zensical/markdown.py +++ b/python/zensical/markdown.py @@ -28,7 +28,9 @@ from datetime import date, datetime from typing import TYPE_CHECKING, Any import yaml +from markdown import Extension as MarkdownExtension from markdown import Markdown +from markdown.preprocessors import Preprocessor from yaml import 
SafeLoader from zensical.config import get_config @@ -51,12 +53,46 @@ FRONT_MATTER_RE = re.compile( Regex pattern to extract front matter. """ + +# ---------------------------------------------------------------------------- +# Classes +# ---------------------------------------------------------------------------- + + +class CurrentPageData(Preprocessor): + """Preprocessor to store current page URL and path.""" + + def __init__(self, md: Markdown, url: str, path: str): + super().__init__(md) + self.url = url + self.path = path + + def run(self, lines: list[str]) -> list[str]: + return lines + + +class CurrentPageExtension(MarkdownExtension): + """Markdown extension to store current page URL and path.""" + + def __init__(self, url: str, path: str): + super().__init__() + self.url = url + self.path = path + + def extendMarkdown(self, md: Markdown) -> None: # noqa: N802 + md.preprocessors.register( + CurrentPageData(md, self.url, self.path), + "zensical_current_page", + 0, + ) + + # ---------------------------------------------------------------------------- # Functions # ---------------------------------------------------------------------------- -def render(content: str, path: str) -> dict: +def render(content: str, path: str, url: str) -> dict: """Render Markdown and return HTML. 
This function returns rendered HTML as well as the table of contents and @@ -66,9 +102,14 @@ def render(content: str, path: str) -> dict: """ config = get_config() + # Insert current page extension at the beginning + extensions = [CurrentPageExtension(url, path)] + config[ + "markdown_extensions" + ] + # Initialize Markdown parser md = Markdown( - extensions=config["markdown_extensions"], + extensions=extensions, extension_configs=config["mdx_configs"], ) @@ -113,6 +154,24 @@ def render(content: str, path: str) -> dict: if meta.get("search", {}).get("exclude", False): search_processor.data = [] + # Extract URL map from extension if available + for extension in md.registeredExtensions: + if type(extension).__qualname__ == "MkdocstringsExtension": + autorefs = { + "primary": extension._autorefs._primary_url_map, # type: ignore[attr-defined] + "secondary": extension._autorefs._secondary_url_map, # type: ignore[attr-defined] + "inventory": extension._autorefs._abs_url_map, # type: ignore[attr-defined] + "titles": extension._autorefs._title_map, # type: ignore[attr-defined] + } + break + else: + autorefs = { + "primary": {}, + "secondary": {}, + "inventory": {}, + "titles": {}, + } + # Return Markdown with metadata return { "meta": meta, @@ -120,6 +179,7 @@ def render(content: str, path: str) -> dict: "search": search_processor.data, "title": "", "toc": [_convert_toc(item) for item in getattr(md, "toc_tokens", [])], + "autorefs": autorefs, }