fix: relative links in raw HTML not correctly resolved (#258)

Signed-off-by: squidfunk <martin.donath@squidfunk.com>
2026-05-06 02:50:34 +00:00 · 2026-04-23 15:16:41 +02:00
parent 87abb2abef
commit 64f3f33b72
3 changed files with 165 additions and 66 deletions
@@ -38,7 +38,7 @@ if TYPE_CHECKING:
 # Constants
 # -----------------------------------------------------------------------------

-_RE = re.compile(r"<img\s[^>]*?>", re.IGNORECASE | re.DOTALL)
+_RE = re.compile(r"<img\s[^>]*?>", re.IGNORECASE)
 """Match images in stashed raw HTML blocks."""

 # -----------------------------------------------------------------------------
@@ -47,13 +47,13 @@ _RE = re.compile(r"<img\s[^>]*?>", re.IGNORECASE | re.DOTALL)


 class GlightboxTreeprocessor(Treeprocessor):
-    """Wraps image elements in anchor tags to enable GLightbox functionality."""
+    """Wraps image elements in anchor tags to integrate with GLightbox."""

    SKIP_CLASSES: frozenset[str] = frozenset(
        {"emojione", "twemoji", "gemoji", "off-glb"}
    )

-    def __init__(self, md: Markdown | None, config: dict[str, object]) -> None:
+    def __init__(self, md: Markdown, config: dict[str, object]):
        super().__init__(md)
        self.config = config

@@ -69,7 +69,7 @@ class GlightboxTreeprocessor(Treeprocessor):
                self._wrap_with_anchor(img, root)

    def _should_skip(self, img: Element, skip_classes: frozenset[str]) -> bool:
-        """Return if this image should be excluded from wrapping."""
+        """Determine if this image should be excluded from wrapping."""
        classes = set(img.get("class", "").split())
        if classes & skip_classes:
            return True
@@ -174,7 +174,7 @@ class GlightboxPostprocessor(Postprocessor):
    parse and modify the HTML with an actual parser.
    """

-    def __init__(self, md: Markdown | None, config: dict[str, object]) -> None:
+    def __init__(self, md: Markdown, config: dict[str, object]):
        super().__init__(md)
        self._processor = GlightboxTreeprocessor(md, config)
        self._processed: set[int] = set()
@@ -189,7 +189,7 @@ class GlightboxPostprocessor(Postprocessor):
        for i, raw in enumerate(self.md.htmlStash.rawHtmlBlocks):
            if i not in self._processed:
                self.md.htmlStash.rawHtmlBlocks[i] = _RE.sub(
-                    self._maybe_wrap, raw
+                    self._maybe_process, raw
                )
                self._processed.add(i)

@@ -197,7 +197,7 @@ class GlightboxPostprocessor(Postprocessor):
        # blocks, which will later be reinstated by the raw HTML postprocessor
        return text

-    def _maybe_wrap(self, m: re.Match[str]) -> str:
+    def _maybe_process(self, m: re.Match[str]) -> str:
        """Wrap a single matched image, delegating to the treeprocessor."""
        raw = m.group(0)
        try:
@@ -23,41 +23,46 @@

 from __future__ import annotations

+import re
 from pathlib import PurePosixPath
 from typing import TYPE_CHECKING
 from urllib.parse import urlparse

-from markdown import Extension, Markdown
+from markdown.extensions import Extension
+from markdown.postprocessors import Postprocessor
 from markdown.treeprocessors import Treeprocessor
 from markdown.util import AMP_SUBSTITUTE

 if TYPE_CHECKING:
    from xml.etree.ElementTree import Element

+    from markdown import Markdown
+
+# -----------------------------------------------------------------------------
+# Constants
+# -----------------------------------------------------------------------------
+
+# The pattern captures the opening quote character as `quote` and requires the
+# same character to close the value via the `(?P=quote)` backreference. The
+# `value` group must be non-empty and stops at either quote character, so a
+# value that contains the other quote character is not matched at all.
+# Matching is case-insensitive.
+#
+# NOTE(review): the pattern is not anchored to a tag or to an attribute
+# boundary, so it also matches attribute names that merely end in `href` or
+# `src` (e.g. `data-src="..."`), as well as occurrences in text content.
+# Confirm that this is intended.
+_RE = re.compile(
+    r'(?:href|src)=(?P<quote>["\'])(?P<value>[^"\']+)(?P=quote)',
+    re.IGNORECASE,
+)
+"""Match `href` and `src` attribute values in stashed raw HTML blocks."""

 # -----------------------------------------------------------------------------
 # Classes
 # -----------------------------------------------------------------------------


-class LinksProcessor(Treeprocessor):
-    """Tree processor to replace links in Markdown with URLs.
-
-    Note that we view this as a bandaid until we can do processing on proper
-    HTML ASTs in Rust. In the meantime, we just replace them as we find them.
-    This processor will replace links to other Markdown files, as well as
-    adjust asset links if directory URLs are used.
-    """
+class LinksTreeprocessor(Treeprocessor):
+    """Rewrites relative links."""

    def __init__(self, md: Markdown, path: str, use_directory_urls: bool):
        super().__init__(md)
-        self.path = path  # Current page
+        self.path = path
        self.use_directory_urls = use_directory_urls

    def run(self, root: Element) -> None:
-        # Now, we determine whether the current page is an index page, as we
-        # must apply slightly different handling in case of directory URLs
-        current_is_index = get_name(self.path) in ("index.md", "README.md")
+        """Walk the element tree and rewrites `href` and `src` attributes."""
        for el in root.iter():
            # In case the element has a `href` or `src` attribute, we parse it
            # as an URL, so we can analyze and alter its path
@@ -65,65 +70,88 @@ class LinksProcessor(Treeprocessor):
            if not key:
                continue

-            # Extract value - Python Markdown does some weird stuff where it
-            # replaces mailto: links with double encoded entities. MkDocs just
-            # skips if it detects that, so we do the same.
-            value = el.get(key, "")
-            if AMP_SUBSTITUTE in value:
-                continue
+            # Rewrite relative links, leaving absolute URLs unchanged. We must
+            # compare against `None` rather than test for truthiness, as a link
+            # can legitimately be rewritten to an empty string, e.g. `index.md`
+            # on an index page when directory URLs are enabled. In that case,
+            # the original value must still be replaced, or the unresolved
+            # `.md` link would end up in the output.
+            url = _rewrite_url(
+                el.get(key, ""), self.path, self.use_directory_urls
+            )
+            if url is not None:
+                el.set(key, url)

-            # Parse URL and skip everything that is not a relative link
-            url = urlparse(value)
-            if url.scheme or url.netloc or url.path.startswith("/"):
-                continue

-            # Leave anchors that go to the same page as they are
-            if not url.path and url.fragment:
-                continue
+class LinksPostprocessor(Postprocessor):
+    """Rewrites relative links in stashed raw HTML blocks.

-            # Now, adjust relative links to Markdown files
-            path = url.path
-            if path.endswith(".md"):
-                path = path.removesuffix(".md") + ".html"
-                name = get_name(path)
-                if self.use_directory_urls:
-                    if name in ("index.html", "README.html"):
-                        path = path.removesuffix(name)
-                    elif path.endswith(".html"):
-                        path = path.removesuffix(".html") + "/"
-                elif name == "README.html":
-                    path = path.removesuffix("README.html") + "index.html"
+    This postprocessor complements the :class:`LinksTreeprocessor` by applying
+    the same URL rewriting logic to raw HTML blocks that Python-Markdown stashes
+    before tree processing and reinstates afterward. This ensures that links
+    inside raw HTML are handled consistently as well.
+    """

-            # If the current page is not an index page, and we should render
-            # directory URLs, we need to prepend a "../" to all links
-            if not current_is_index and self.use_directory_urls:
-                path = f"../{path}"
+    def __init__(self, md: Markdown, path: str, use_directory_urls: bool):
+        """Initialize the postprocessor.
+
+        Stores the path of the current page and the directory URLs setting,
+        both of which are forwarded to the URL rewriting logic for every
+        matched attribute.
+        """
+        super().__init__(md)
+
+        # Indices of stashed raw HTML blocks that were already rewritten
+        self._processed: set[int] = set()
+
+        # Settings forwarded to the URL rewriting logic
+        self._use_directory_urls = use_directory_urls
+        self._path = path

-            # Reassemble URL and update link
-            el.set(key, url._replace(path=path).geturl())
+    def run(self, text: str) -> str:
+        """Rewrite links in stashed raw HTML blocks that are still pending.
+
+        Each block is rewritten in place inside the HTML stash, and its index
+        is recorded, so a block is only ever rewritten once, even if this
+        method is invoked several times. The given text is returned as is.
+        """
+        blocks = self.md.htmlStash.rawHtmlBlocks
+        for index, block in enumerate(blocks):
+            if index in self._processed:
+                continue
+
+            # Substitute all matching attributes and remember the block
+            blocks[index] = _RE.sub(self._maybe_process, block)
+            self._processed.add(index)
+
+        # The text itself needs no changes - only the stashed raw HTML blocks
+        # are modified, and the raw HTML postprocessor reinstates them later
+        return text
+
+    def _maybe_process(self, m: re.Match[str]) -> str:
+        """Return the replacement for a single matched `href` or `src`.
+
+        If the matched value is not a relative link, the match is returned
+        verbatim. Otherwise, the attribute is reassembled from its original
+        name, the original quote character, and the rewritten value.
+        """
+        original = m.group(0)
+        rewritten = _rewrite_url(
+            m.group("value"), self._path, self._use_directory_urls
+        )
+
+        # Not a relative link, so the attribute is left untouched
+        if rewritten is None:
+            return original
+
+        # Everything before the first `=` is the attribute name as written in
+        # the source, which we keep together with the original quote style
+        name, _, _ = original.partition("=")
+        quote = m.group("quote")
+        return f"{name}={quote}{rewritten}{quote}"


 # -----------------------------------------------------------------------------


 class LinksExtension(Extension):
-    """A Markdown extension to resolve links to other Markdown files."""
+    """Markdown extension to rewrite relative links to other files.

-    def __init__(self, path: str, use_directory_urls: bool):
+    Registers both a treeprocessor for links in the normal Markdown flow and
+    a postprocessor for links inside stashed raw HTML blocks, so that all
+    relative URLs are rewritten consistently regardless of how they appear in
+    the source document.
+    """
+
+    def __init__(self, path: str, use_directory_urls: bool) -> None:
        """Initialize the extension."""
-        self.path = path  # Current page
+        self.path = path
        self.use_directory_urls = use_directory_urls

    def extendMarkdown(self, md: Markdown) -> None:  # noqa: N802
        """Register Markdown extension."""
        md.registerExtension(self)

-        # Create and register treeprocessor - we use the same priority as the
-        # `relpath` treeprocessor, the latter of which is guaranteed to run
-        # after our treeprocessor, so we can check the original Markdown URIs
-        # before they are resolved to URLs.
-        processor = LinksProcessor(md, self.path, self.use_directory_urls)
-        md.treeprocessors.register(processor, "zrelpath", 0)
+        # Register treeprocessor
+        treeprocessor = LinksTreeprocessor(
+            md, self.path, self.use_directory_urls
+        )
+        md.treeprocessors.register(treeprocessor, "zrelpath", 0)
+
+        # Register postprocessor before `raw_html` processor
+        postprocessor = LinksPostprocessor(
+            md, self.path, self.use_directory_urls
+        )
+        md.postprocessors.register(postprocessor, "zrelpath_raw", 35)


 # -----------------------------------------------------------------------------
@@ -131,7 +159,78 @@ class LinksExtension(Extension):
 # -----------------------------------------------------------------------------


-def get_name(path: str) -> str:
-    """Get the name of a file from a given path."""
-    pure_path = PurePosixPath(path)
-    return pure_path.name
+def _get_name(path: str) -> str:
+    """Extract the final component (file name) of a POSIX-style path.
+
+    The path is always interpreted with forward slashes as separators, so
+    `guide/index.md` yields `index.md`.
+    """
+    return PurePosixPath(path).name
+
+
+def _is_relative(value: str) -> bool:
+    """Check whether a URL string is a relative link that may be rewritten.
+
+    Values containing Python-Markdown's `AMP_SUBSTITUTE` placeholder, URLs
+    with a scheme or host, paths starting with `/`, and anchor-only
+    references are all reported as not relative.
+    """
+    if AMP_SUBSTITUTE in value:
+        return False
+
+    # Absolute URLs (e.g. `https://example.com`) and protocol-relative URLs
+    # (e.g. `//example.com`) point to a fixed location and are not relative
+    parts = urlparse(value)
+    if parts.scheme or parts.netloc:
+        return False
+
+    # Paths starting with a slash are resolved against the site root
+    if parts.path.startswith("/"):
+        return False
+
+    # Anchor-only references (e.g. `#section`) point to a section within the
+    # same page rather than a different page, so they are not rewritten
+    if not parts.path and parts.fragment:
+        return False
+
+    # Everything else is a relative link
+    return True
+
+
+def _md_path_to_html(path: str, use_directory_urls: bool) -> str:
+    """Map a relative path to a Markdown file to the path it is served at.
+
+    Paths that do not end in `.md` are returned unchanged. With directory
+    URLs, `index.md` and `README.md` resolve to their parent directory, and
+    every other page resolves to a directory with a trailing slash. Without
+    directory URLs, `.md` becomes `.html`, with `README.md` mapping to
+    `index.html`.
+    """
+    if not path.endswith(".md"):
+        return path
+
+    # Derive the HTML path and its file name from the path without extension
+    stem = path.removesuffix(".md")
+    html = f"{stem}.html"
+    filename = _get_name(html)
+
+    # Flat URL mode: only `README.html` needs to be renamed to `index.html`
+    if not use_directory_urls:
+        if filename == "README.html":
+            return html.removesuffix(filename) + "index.html"
+        return html
+
+    # Directory URL mode: index pages collapse to their parent directory
+    if filename in ("index.html", "README.html"):
+        return html.removesuffix(filename)
+
+    # Directory URL mode: all other pages become directories
+    return f"{stem}/"
+
+
+def _apply_directory_prefix(
+    value: str, path: str, use_directory_urls: bool
+) -> str:
+    """Prefix a link with `../` if the linking page needs it.
+
+    The prefix is only added when directory URLs are enabled and the page at
+    `path` is neither `index.md` nor `README.md`. In all other cases, the
+    value is returned unchanged.
+    """
+    if not use_directory_urls:
+        return value
+
+    # Index pages need no prefix
+    if _get_name(path) in ("index.md", "README.md"):
+        return value
+
+    # All other pages need to go up one level
+    return f"../{value}"
+
+
+def _rewrite_url(value: str, path: str, use_directory_urls: bool) -> str | None:
+    """Compute the rewritten form of a relative link.
+
+    Returns `None` if the given value is not a relative link. Otherwise, only
+    the path component is rewritten, while all other components, like query
+    parameters and fragments, are preserved. Note that the result can be an
+    empty string, e.g. for `index.md` linked from an index page with directory
+    URLs enabled, so callers should compare against `None` explicitly.
+    """
+    if _is_relative(value):
+        parts = urlparse(value)
+
+        # Convert the path to its final form, then make it relative to the
+        # location the current page is served from
+        target = _apply_directory_prefix(
+            _md_path_to_html(parts.path, use_directory_urls),
+            path,
+            use_directory_urls,
+        )
+        return parts._replace(path=target).geturl()
+
+    # Not a relative link, so there is nothing to rewrite
+    return None
@@ -30,7 +30,7 @@ from urllib.parse import urlparse
 from markdown import Extension, Markdown
 from markdown.treeprocessors import Treeprocessor

-from zensical.extensions.links import LinksProcessor
+from zensical.extensions.links import LinksTreeprocessor
 from zensical.extensions.utilities.filter import Filter

 if TYPE_CHECKING:
@@ -67,7 +67,7 @@ class PreviewProcessor(Treeprocessor):
        # changes, we would need to wrap this extension in a plugin, but for
        # the time being we are sneaky and will probably get away with it.
        processor = self.md.treeprocessors[at]
-        if not isinstance(processor, LinksProcessor):
+        if not isinstance(processor, LinksTreeprocessor):
            raise TypeError("Links processor not registered")

        # Normalize configurations