Files
zensical/python/zensical/extensions/search.py
T
2026-05-01 13:27:06 +02:00

374 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Copyright (c) 2025-2026 Zensical and contributors
# SPDX-License-Identifier: MIT
# All contributions are certified under the DCO
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
from html import escape
from html.parser import HTMLParser
from typing import Any
from zensical.markdown.extensions import ExtensionExt, MarkdownExt
from zensical.markdown.processors import PostprocessorExt
# -----------------------------------------------------------------------------
# Classes
# -----------------------------------------------------------------------------
class SearchProcessor(PostprocessorExt):
"""Post processor to extract searchable content from the rendered HTML."""
def __init__(self, md: MarkdownExt) -> None:
super().__init__(md)
self.data: list[dict[str, Any]] = []
def run(self, text: str) -> str:
"""Process the rendered HTML and extract text length."""
# Divide page content into sections
parser = Parser()
parser.feed(text)
parser.close()
# Extract data from sections that are not excluded
self.data = []
for section in parser.data:
if not section.is_excluded():
# Compute title and text
title = "".join(section.title).strip()
content = "".join(section.text).strip()
# Store data for external access
self.data.append(
{
"location": section.id,
"level": section.level,
"title": title,
"text": content,
"path": [],
"tags": [],
}
)
# Return the original HTML unchanged
return text
class SearchExtension(ExtensionExt):
"""Markdown extension for search indexing."""
def __init__(self, **kwargs: Any) -> None:
self.config = {"keep": [set(), "Set of HTML tags to keep in output"]}
super().__init__(**kwargs)
def extendMarkdown(self, md: MarkdownExt) -> None:
"""Register the PostProcessor with Markdown."""
processor = SearchProcessor(md)
md.postprocessors.register(processor, "search", 0)
def makeExtension(**kwargs: Any) -> SearchExtension:
"""Factory function for creating the extension."""
return SearchExtension(**kwargs)
# -----------------------------------------------------------------------------
# HTML element
class Element:
"""HTML element.
An element with attributes, essentially a small wrapper object for the
parser to access attributes in other callbacks than handle_starttag.
"""
# Initialize HTML element
def __init__(
self, tag: str, attrs: dict[str, str | None] | None = None
) -> None:
self.tag = tag
self.attrs = attrs or {}
# String representation
def __repr__(self):
return self.tag
# Support comparison (compare by tag only)
def __eq__(self, other: object) -> bool:
if isinstance(other, Element):
return self.tag == other.tag
return self.tag == other
# Support set operations
def __hash__(self):
return hash(self.tag)
# Check whether the element should be excluded
def is_excluded(self) -> bool:
return "data-search-exclude" in self.attrs
# -----------------------------------------------------------------------------
# HTML section
class Section:
"""HTML section.
A block of text with markup, preceded by a title (with markup), i.e., a
headline with a certain level (h1-h6). Internally used by the parser.
"""
# Initialize HTML section
def __init__(self, el: Element, level: int, depth: int = 0) -> None:
self.el = el
self.depth: int | float = depth
self.level = level
# Initialize section data
self.text: list[str] = []
self.title: list[str] = []
self.id: str | None = None
# String representation
def __repr__(self):
if self.id:
return f"{self.el.tag}#{self.id}"
return self.el.tag
# Check whether the section should be excluded
def is_excluded(self) -> bool:
return self.el.is_excluded()
# -----------------------------------------------------------------------------
# HTML parser
class Parser(HTMLParser):
"""Section divider.
This parser divides the given string of HTML into a list of sections, each
of which are preceded by a h1-h6 level heading. A white- and blacklist of
tags dictates which tags should be preserved as part of the index, and
which should be ignored in their entirety.
"""
# Initialize HTML parser
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
# Tags to skip
self.skip: set[str | Element] = {
"object", # Objects
"script", # Scripts
"style", # Styles
}
# Current context and section
self.context: list[Element] = []
self.section: Section | None = None
# All parsed sections
self.data: list[Section] = []
# Called at the start of every HTML tag
def handle_starttag(
self, tag: str, attrs: list[tuple[str, str | None]]
) -> None:
attrs_dict = dict(attrs)
# Ignore self-closing tags
el = Element(tag, attrs_dict)
if tag not in void:
self.context.append(el)
else:
return
# Handle heading
if tag in ([f"h{x}" for x in range(1, 7)]):
depth = len(self.context)
if "id" in attrs_dict:
# Ensure top-level section
if tag != "h1" and not self.data:
self.section = Section(Element("hx"), 1, depth)
self.data.append(self.section)
# Set identifier, if not first section
self.section = Section(el, int(tag[1:2]), depth)
if self.data:
self.section.id = attrs_dict["id"]
# Append section to list
self.data.append(self.section)
# Handle preface - ensure top-level section
if not self.section:
self.section = Section(Element("hx"), 1)
self.data.append(self.section)
# Handle special cases to skip
for key, value in attrs_dict.items():
# Skip block if explicitly excluded from search
if key == "data-search-exclude":
self.skip.add(el)
return
# Skip line numbers - see https://bit.ly/3GvubZx
if key == "class" and value == "linenodiv":
self.skip.add(el)
return
# Render opening tag if kept
if not self.skip.intersection(self.context) and tag in keep:
# Check whether we're inside the section title
data = self.section.text
if self.section.el in self.context:
data = self.section.title
# Append to section title or text
data.append(f"<{tag}>")
# Called at the end of every HTML tag
def handle_endtag(self, tag: str) -> None:
if not self.context or self.context[-1] != tag:
return
# Check whether we're exiting the current context, which happens when
# a headline is nested in another element. In that case, we close the
# current section, continuing to append data to the previous section,
# which could also be a nested section see https://bit.ly/3IxxIJZ
assert self.section is not None # noqa: S101
if self.section.depth > len(self.context):
for section in reversed(self.data):
if section.depth <= len(self.context):
# Set depth to infinity in order to denote that the current
# section is exited and must never be considered again.
self.section.depth = float("inf")
self.section = section
break
# Remove element from skip list
el = self.context.pop()
if el in self.skip:
if el.tag not in ["script", "style", "object"]:
self.skip.remove(el)
return
# Render closing tag if kept
if not self.skip.intersection(self.context) and tag in keep:
# Check whether we're inside the section title
data = self.section.text
if self.section.el in self.context:
data = self.section.title
# Search for corresponding opening tag
index = data.index(f"<{tag}>")
for i in range(index + 1, len(data)):
if not data[i].isspace():
index = len(data)
break
# Remove element if empty (or only whitespace)
if len(data) > index:
while len(data) > index:
data.pop()
# Append to section title or text
else:
data.append(f"</{tag}>")
# Called for the text contents of each tag
def handle_data(self, data: str) -> None:
if self.skip.intersection(self.context):
return
# Collapse whitespace in non-pre contexts
if "pre" not in self.context:
if not data.isspace():
data = data.replace("\n", " ")
else:
data = " "
# Handle preface - ensure top-level section
if not self.section:
self.section = Section(Element("hx"), 1)
self.data.append(self.section)
# Handle section headline
if self.section.el in self.context:
permalink = False
for el in self.context:
if el.tag == "a" and el.attrs.get("class") == "headerlink":
permalink = True
# Ignore permalinks
if not permalink:
self.section.title.append(escape(data, quote=False))
# Collapse adjacent whitespace
elif data.isspace():
if (
not self.section.text
or not self.section.text[-1].isspace()
or "pre" in self.context
):
self.section.text.append(data)
# Handle everything else
else:
self.section.text.append(escape(data, quote=False))
# -----------------------------------------------------------------------------
# Data
# -----------------------------------------------------------------------------
# Tags to keep
keep = {
"p",
"code",
"pre",
"li",
"ol",
"ul",
"sub",
"sup",
}
# Tags that are self-closing
void = {
"area",
"base",
"br",
"col",
"embed",
"hr",
"img",
"input",
"link",
"meta",
"param",
"source",
"track",
"wbr",
}