diff --git a/python/zensical/extensions/glightbox.py b/python/zensical/extensions/glightbox.py index e6453f5..e3a9a7b 100644 --- a/python/zensical/extensions/glightbox.py +++ b/python/zensical/extensions/glightbox.py @@ -38,7 +38,7 @@ if TYPE_CHECKING: # Constants # ----------------------------------------------------------------------------- -_RE = re.compile(r"]*?>", re.IGNORECASE | re.DOTALL) +_RE = re.compile(r"]*?>", re.IGNORECASE) """Match images in stashed raw HTML blocks.""" # ----------------------------------------------------------------------------- @@ -47,13 +47,13 @@ _RE = re.compile(r"]*?>", re.IGNORECASE | re.DOTALL) class GlightboxTreeprocessor(Treeprocessor): - """Wraps image elements in anchor tags to enable GLightbox functionality.""" + """Wraps image elements in anchor tags to integrate with GLightbox.""" SKIP_CLASSES: frozenset[str] = frozenset( {"emojione", "twemoji", "gemoji", "off-glb"} ) - def __init__(self, md: Markdown | None, config: dict[str, object]) -> None: + def __init__(self, md: Markdown, config: dict[str, object]): super().__init__(md) self.config = config @@ -69,7 +69,7 @@ class GlightboxTreeprocessor(Treeprocessor): self._wrap_with_anchor(img, root) def _should_skip(self, img: Element, skip_classes: frozenset[str]) -> bool: - """Return if this image should be excluded from wrapping.""" + """Determine if this image should be excluded from wrapping.""" classes = set(img.get("class", "").split()) if classes & skip_classes: return True @@ -174,7 +174,7 @@ class GlightboxPostprocessor(Postprocessor): parse and modify the HTML with an actual parser. """ - def __init__(self, md: Markdown | None, config: dict[str, object]) -> None: + def __init__(self, md: Markdown, config: dict[str, object]): super().__init__(md) self._processor = GlightboxTreeprocessor(md, config) self._processed: set[int] = set() @@ -189,7 +189,7 @@ class GlightboxPostprocessor(Postprocessor): for i, raw in enumerate(self.md.htmlStash.rawHtmlBlocks): if i not in self._processed: self.md.htmlStash.rawHtmlBlocks[i] = _RE.sub( - self._maybe_wrap, raw + self._maybe_process, raw ) self._processed.add(i) @@ -197,7 +197,7 @@ class GlightboxPostprocessor(Postprocessor): # blocks, which will later be reinstated by the raw HTML postprocessor return text - def _maybe_wrap(self, m: re.Match[str]) -> str: + def _maybe_process(self, m: re.Match[str]) -> str: """Wrap a single matched image, delegating to the treeprocessor.""" raw = m.group(0) try: diff --git a/python/zensical/extensions/links.py b/python/zensical/extensions/links.py index 13e5e01..596d7b8 100644 --- a/python/zensical/extensions/links.py +++ b/python/zensical/extensions/links.py @@ -23,41 +23,46 @@ from __future__ import annotations +import re from pathlib import PurePosixPath from typing import TYPE_CHECKING from urllib.parse import urlparse -from markdown import Extension, Markdown +from markdown.extensions import Extension +from markdown.postprocessors import Postprocessor from markdown.treeprocessors import Treeprocessor from markdown.util import AMP_SUBSTITUTE if TYPE_CHECKING: from xml.etree.ElementTree import Element + from markdown import Markdown + +# ----------------------------------------------------------------------------- +# Constants +# ----------------------------------------------------------------------------- + +_RE = re.compile( + r'(?:href|src)=(?P["\'])(?P[^"\']+)(?P=quote)', + re.IGNORECASE, +) +"""Match `href` and `src` attribute values in stashed raw HTML blocks.""" # ----------------------------------------------------------------------------- # Classes # ----------------------------------------------------------------------------- -class LinksProcessor(Treeprocessor): - """Tree processor to replace links in Markdown with URLs. - - Note that we view this as a bandaid until we can do processing on proper - HTML ASTs in Rust. In the meantime, we just replace them as we find them. - This processor will replace links to other Markdown files, as well as - adjust asset links if directory URLs are used. - """ +class LinksTreeprocessor(Treeprocessor): + """Rewrites relative links.""" def __init__(self, md: Markdown, path: str, use_directory_urls: bool): super().__init__(md) - self.path = path # Current page + self.path = path self.use_directory_urls = use_directory_urls def run(self, root: Element) -> None: - # Now, we determine whether the current page is an index page, as we - # must apply slightly different handling in case of directory URLs - current_is_index = get_name(self.path) in ("index.md", "README.md") + """Walk the element tree and rewrites `href` and `src` attributes.""" for el in root.iter(): # In case the element has a `href` or `src` attribute, we parse it # as an URL, so we can analyze and alter its path @@ -65,65 +70,88 @@ class LinksProcessor(Treeprocessor): if not key: continue - # Extract value - Python Markdown does some weird stuff where it - # replaces mailto: links with double encoded entities. MkDocs just - # skips if it detects that, so we do the same. - value = el.get(key, "") - if AMP_SUBSTITUTE in value: - continue + # Rewrite relative links, leaving absolute URLs unchanged + if url := _rewrite_url( + el.get(key, ""), self.path, self.use_directory_urls + ): + el.set(key, url) - # Parse URL and skip everything that is not a relative link - url = urlparse(value) - if url.scheme or url.netloc or url.path.startswith("/"): - continue - # Leave anchors that go to the same page as they are - if not url.path and url.fragment: - continue +class LinksPostprocessor(Postprocessor): + """Rewrites relative links in stashed raw HTML blocks. - # Now, adjust relative links to Markdown files - path = url.path - if path.endswith(".md"): - path = path.removesuffix(".md") + ".html" - name = get_name(path) - if self.use_directory_urls: - if name in ("index.html", "README.html"): - path = path.removesuffix(name) - elif path.endswith(".html"): - path = path.removesuffix(".html") + "/" - elif name == "README.html": - path = path.removesuffix("README.html") + "index.html" + This postprocessor complements the :class:`LinksTreeprocessor` by applying + the same URL rewriting logic to raw HTML blocks that Python-Markdown stashes + before tree processing and reinstates afterward. This ensures that links + inside raw HTML are handled consistently as well. + """ - # If the current page is not an index page, and we should render - # directory URLs, we need to prepend a "../" to all links - if not current_is_index and self.use_directory_urls: - path = f"../{path}" + def __init__(self, md: Markdown, path: str, use_directory_urls: bool): + super().__init__(md) + self._path = path + self._use_directory_urls = use_directory_urls + self._processed: set[int] = set() - # Reassemble URL and update link - el.set(key, url._replace(path=path).geturl()) + def run(self, text: str) -> str: + """Rewrite `href` and `src` attributes of stashed HTML blocks.""" + for i, raw in enumerate(self.md.htmlStash.rawHtmlBlocks): + if i not in self._processed: + self.md.htmlStash.rawHtmlBlocks[i] = _RE.sub( + self._maybe_process, raw + ) + self._processed.add(i) + + # Return text unmodified, as we only need to modify the stashed raw HTML + # blocks, which will later be reinstated by the raw HTML postprocessor + return text + + def _maybe_process(self, m: re.Match[str]) -> str: + """Rewrite a single matched `href` or `src` value.""" + value = m.group("value") + + # Rewrite relative links, leaving absolute URLs unchanged + updated = _rewrite_url(value, self._path, self._use_directory_urls) + if updated is None: + return m.group(0) + + # Reconstruct the attribute with the original quote style preserved + q = m.group("quote") + attr = m.group(0).split("=")[0] + return f"{attr}={q}{updated}{q}" # ----------------------------------------------------------------------------- class LinksExtension(Extension): - """A Markdown extension to resolve links to other Markdown files.""" + """Markdown extension to rewrite relative links to other files. - def __init__(self, path: str, use_directory_urls: bool): + Registers both a treeprocessor for links in the normal Markdown flow and + a postprocessor for links inside stashed raw HTML blocks, so that all + relative URLs are rewritten consistently regardless of how they appear in + the source document. + """ + + def __init__(self, path: str, use_directory_urls: bool) -> None: """Initialize the extension.""" - self.path = path # Current page + self.path = path self.use_directory_urls = use_directory_urls def extendMarkdown(self, md: Markdown) -> None: # noqa: N802 """Register Markdown extension.""" md.registerExtension(self) - # Create and register treeprocessor - we use the same priority as the - # `relpath` treeprocessor, the latter of which is guaranteed to run - # after our treeprocessor, so we can check the original Markdown URIs - # before they are resolved to URLs. - processor = LinksProcessor(md, self.path, self.use_directory_urls) - md.treeprocessors.register(processor, "zrelpath", 0) + # Register treeprocessor + treeprocessor = LinksTreeprocessor( + md, self.path, self.use_directory_urls + ) + md.treeprocessors.register(treeprocessor, "zrelpath", 0) + + # Register postprocessor before `raw_html` processor + postprocessor = LinksPostprocessor( + md, self.path, self.use_directory_urls + ) + md.postprocessors.register(postprocessor, "zrelpath_raw", 35) # ----------------------------------------------------------------------------- @@ -131,7 +159,78 @@ class LinksExtension(Extension): # ----------------------------------------------------------------------------- -def get_name(path: str) -> str: - """Get the name of a file from a given path.""" - pure_path = PurePosixPath(path) - return pure_path.name +def _get_name(path: str) -> str: + """Return the filename component of a POSIX-style path.""" + path = PurePosixPath(path) + return path.name + + +def _is_relative(value: str) -> bool: + """Determine whether a URL string is a relative link.""" + if AMP_SUBSTITUTE in value: + return False + + # Absolute URLs (e.g. `https://example.com`) and protocol-relative URLs + url = urlparse(value) + if url.scheme or url.netloc or url.path.startswith("/"): + return False + + # Anchor-only references (e.g. `#section`) should not be rewritten, as they + # point to a section within the same page rather than a different page + return not (not url.path and url.fragment) + + +def _md_path_to_html(path: str, use_directory_urls: bool) -> str: + """Convert a relative `.md` path to its final HTML form.""" + if not path.endswith(".md"): + return path + + # Convert the `.md` extension to `.html` and extract the file name + path = path.removesuffix(".md") + ".html" + name = _get_name(path) + + # When directory URLs are enabled, `index.html` and `README.html` collapse + # to their parent directory, while all other pages become directories with + # a trailing slash. When directory URLs are disabled, `README.html` is + # served as `index.html`, while all other pages remain unchanged. + if use_directory_urls: + if name in ("index.html", "README.html"): + return path.removesuffix(name) + + # All other pages become directories (trailing slash) + return path.removesuffix(".html") + "/" + + # README.html is served as index.html in flat URL mode + if name == "README.html": + return path.removesuffix("README.html") + "index.html" + + # No change needed + return path + + +def _apply_directory_prefix( + value: str, path: str, use_directory_urls: bool +) -> str: + """Prepend `../` for non-index pages when directory URLs are enabled.""" + is_index = _get_name(path) in ("index.md", "README.md") + if not is_index and use_directory_urls: + return f"../{value}" + + # No change needed + return value + + +def _rewrite_url(value: str, path: str, use_directory_urls: bool) -> str | None: + """Rewrite a relative URL.""" + if not _is_relative(value): + return None + + # Parse URL, so we can analyze and alter its path while preserving other + # components like query parameters and fragments + url = urlparse(value) + + # Rewrite the path component, noting that the URL may be relative to the + # current page, so we need to adjust it accordingly + value = _md_path_to_html(url.path, use_directory_urls) + value = _apply_directory_prefix(value, path, use_directory_urls) + return url._replace(path=value).geturl() diff --git a/python/zensical/extensions/preview.py b/python/zensical/extensions/preview.py index eb9ac10..5b415de 100644 --- a/python/zensical/extensions/preview.py +++ b/python/zensical/extensions/preview.py @@ -30,7 +30,7 @@ from urllib.parse import urlparse from markdown import Extension, Markdown from markdown.treeprocessors import Treeprocessor -from zensical.extensions.links import LinksProcessor +from zensical.extensions.links import LinksTreeprocessor from zensical.extensions.utilities.filter import Filter if TYPE_CHECKING: @@ -67,7 +67,7 @@ class PreviewProcessor(Treeprocessor): # changes, we would need to wrap this extension in a plugin, but for # the time being we are sneaky and will probably get away with it. processor = self.md.treeprocessors[at] - if not isinstance(processor, LinksProcessor): + if not isinstance(processor, LinksTreeprocessor): raise TypeError("Links processor not registered") # Normalize configurations