diff --git a/.github/workflows/test-only.yml b/.github/workflows/test-only.yml index 04958b7e..246d254c 100644 --- a/.github/workflows/test-only.yml +++ b/.github/workflows/test-only.yml @@ -44,10 +44,60 @@ jobs: exit 1 fi + lint-template-i18n: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Check for fragmented gettext calls in templates + run: | + python3 << 'PYEOF' + import re, sys + from pathlib import Path + + # Detects {{ _(...) }} calls on the same line that are separated by at least one + # HTML tag or non-translating Jinja2 expression (whitespace alone between calls is + # not flagged) — the anti-pattern of splitting one sentence across multiple msgids. + # See https://github.com/dgtlmoon/changedetection.io/issues/4074 for background. + # + # The correct fix is to consolidate fragments into one entire-sentence msgid, + # injecting dynamic values via %(name)s kwargs — per the GNU gettext manual + # sections "Entire sentences" and "No string concatenation". See PR #4076 for + # worked examples of each consolidation pattern. + # + # BASELINE: this limit reflects pre-existing violations present when this check + # was introduced. It must only ever go DOWN. Each time you fix a template, lower + # the limit by the number of lines fixed so the improvement is locked in. + # When the count reaches 0, replace the baseline check with a hard sys.exit(1). 
+ BASELINE_LIMIT = 44 + + FRAGMENT_RE = re.compile( + r'\{\{[^{}]*\b_\s*\([^)]*\)[^{}]*\}\}' + r'(?:\s*(?:<[^>]+>|\{\{(?![^}]*\b_\s*\()[^}]*\}\})\s*)+' + r'\{\{[^{}]*\b_\s*\([^)]*\)[^{}]*\}\}' + ) + + violations = [] + for f in sorted(Path('changedetectionio').rglob('*.html')): + for lineno, line in enumerate(f.read_text(encoding='utf-8').splitlines(), 1): + if FRAGMENT_RE.search(line): + violations.append(f"{f}:{lineno}: {line.strip()[:120]}") + + count = len(violations) + print(f"Fragmented i18n calls found: {count} (limit: {BASELINE_LIMIT})") + for v in violations: + print(v) + + if count > BASELINE_LIMIT: + print(f"\nERROR: {count} fragmented gettext calls exceed the baseline of {BASELINE_LIMIT}.") + print("Consolidate adjacent _() calls into a single entire-sentence msgid.") + print("See https://github.com/dgtlmoon/changedetection.io/issues/4074 for patterns.") + sys.exit(1) + PYEOF + test-application-3-10: # Only run on push to master (including PR merges) if: github.event_name == 'push' && github.ref == 'refs/heads/master' - needs: [lint-code, lint-translations] + needs: [lint-code, lint-translations, lint-template-i18n] uses: ./.github/workflows/test-stack-reusable-workflow.yml with: python-version: '3.10' @@ -55,7 +105,7 @@ jobs: test-application-3-11: # Always run - needs: [lint-code, lint-translations] + needs: [lint-code, lint-translations, lint-template-i18n] uses: ./.github/workflows/test-stack-reusable-workflow.yml with: python-version: '3.11' @@ -63,7 +113,7 @@ jobs: test-application-3-12: # Only run on push to master (including PR merges) if: github.event_name == 'push' && github.ref == 'refs/heads/master' - needs: [lint-code, lint-translations] + needs: [lint-code, lint-translations, lint-template-i18n] uses: ./.github/workflows/test-stack-reusable-workflow.yml with: python-version: '3.12' @@ -72,7 +122,7 @@ jobs: test-application-3-13: # Only run on push to master (including PR merges) if: github.event_name == 'push' && github.ref == 'refs/heads/master' - needs: 
[lint-code, lint-translations] + needs: [lint-code, lint-translations, lint-template-i18n] uses: ./.github/workflows/test-stack-reusable-workflow.yml with: python-version: '3.13' @@ -81,7 +131,7 @@ jobs: test-application-3-14: #if: github.event_name == 'push' && github.ref == 'refs/heads/master' - needs: [lint-code, lint-translations] + needs: [lint-code, lint-translations, lint-template-i18n] uses: ./.github/workflows/test-stack-reusable-workflow.yml with: python-version: '3.14' diff --git a/README.md b/README.md index 4eedf4e9..db10fe57 100644 --- a/README.md +++ b/README.md @@ -352,4 +352,6 @@ changedetectionio.html_tools.elementpath_tostring: Copyright (c), 2018-2021, SIS Recognition of fantastic contributors to the project +Developer note: see [translation guide](changedetectionio/translations/README.md) for i18n template patterns and workflow. + - Constantin Hong https://github.com/Constantin1489 diff --git a/changedetectionio/translations/README.md b/changedetectionio/translations/README.md index 3deb8583..35bfa2d4 100644 --- a/changedetectionio/translations/README.md +++ b/changedetectionio/translations/README.md @@ -1,103 +1,231 @@ -# Translation Guide +# Translators Guide -## Updating Translations +This document is for contributors who write templates (HTML) and for translators who maintain `.po` files. +It exists because fragmented `msgid`s — splitting a single sentence across multiple `_()` calls — cause +systematic translation breakage across many languages. Follow the patterns here to prevent that. -To maintain consistency and minimize unnecessary changes in translation files, run these commands: +--- -```bash -python setup.py extract_messages # Extract translatable strings -python setup.py update_catalog # Update all language files -python setup.py compile_catalog # Compile to binary .mo files +## Terminology + +- **Always use "monitor" or "watcher"** for the concept of watching a URL — never the bare word "watch", + which translates to "clock" (e.g. 
`hodinky` in Czech, `시계` in Korean, `時計` in Japanese). +- Use the **shortest suitable wording** for each language. If a language naturally uses the English + derivative, prefer that. + +--- + +## Template rules: do not fragment `msgid`s + +### Why fragments break translation + +The GNU gettext manual is explicit on this: + +> **[Entire sentences](https://www.gnu.org/software/gettext/manual/html_node/Entire-sentences.html)**: +> Translatable strings should be entire sentences. Because gender/number declension depends on other +> parts of the sentence, half-sentence *"dumb string concatenation"* breaks in many languages other than English. + +> **[No string concatenation](https://www.gnu.org/software/gettext/manual/html_node/No-string-concatenation.html)**: +> Placing adjacent `_()` calls is semantically equivalent to runtime `strcat` concatenation, so the same +> guideline applies. The manual also notes that "in some languages the translator might want to swap the +> order" of components. + +> **[No embedded URLs](https://www.gnu.org/software/gettext/manual/html_node/No-embedded-URLs.html)**: +> URLs should not be written directly inside `msgid`s; they should be injected via `%(name)s` placeholders +> and values passed as kwargs. + +> **[No unusual markup](https://www.gnu.org/software/gettext/manual/html_node/No-unusual-markup.html)**: +> "HTML markup, however, is common enough that it's probably ok to use in translatable strings." 
+ +Fragments break differently depending on language family: + +| Language family | How fragmentation breaks it | +|---|---| +| SOV (Japanese, Korean, Turkish) | Verb-final word order can't be achieved when verb and subject are in separate fragments | +| Germanic (German) | Gender/case agreement between article and noun is lost across fragment boundaries | +| Romance (French, Spanish, Italian, Portuguese) | Adjective placement, subjunctive mood, verb agreement can't be maintained | +| Slavic (Czech, Ukrainian) | Case (driven by preposition/verb relationships) is easy to get wrong | +| CJK (Chinese, Japanese, Korean) | Modifier position and SVO-vs-topic-prominent differences can't be applied at fragment level | + +A past workaround was redistributing translations across adjacent fragments and using `msgstr " "` (a +single space) to suppress unused fragments. This is fragile: as soon as the same short `msgid` is reused +in a different template, the redistributed translation is applied verbatim and breaks the new context. + +--- + +## The four correct patterns + +### Pattern 1 — Inline HTML embedding + +Keep markup **inside** the `msgid`. Render with `| safe`. This also lets CJK translators decide how to +handle `<i>` (see CJK section below). + +```jinja +{# BAD: three fragments; CJK translators can't see the <i> at all #} +{{ _('Helps reduce changes detected caused by sites shuffling lines around, combine with') }} +<i>{{ _('check unique lines') }}</i> +{{ _('below.') }} + +{# GOOD: one msgid, rendered with |safe #} +{{ _('Helps reduce changes detected caused by sites shuffling lines around, combine with <i>check unique lines</i> below.') | safe }} ``` -## Configuration +### Pattern 2 — URL as kwarg -All translation settings are configured in **`../../setup.cfg`** (single source of truth). +Pass URLs via `%(name)s` so translators can freely reorder them. 
-The configuration below is shown for reference - **edit `setup.cfg` to change settings**: +```jinja +{# BAD: URL hardcoded between three fragments #} +{{ _('Use') }} +<a href="https://github.com/caronc/apprise">{{ _('AppRise Notification URLs') }}</a> +{{ _('for notification to just about any service!') }} + +{# GOOD: URL passed as kwarg, embedded in the msgid #} +{{ _('Use <a href="%(url)s">AppRise Notification URLs</a> for notification to just about any service!', + url='https://github.com/caronc/apprise') | safe }} +``` + +### Pattern 3 — Literal `{{}}` escape as kwarg + +Jinja2 would double-interpolate `{{token}}` inside a `_()` call. Pass it as a kwarg instead. + +```jinja +{# BAD: literal {{token}} in the middle forces splitting #} +{{ _('Accepts the') }} {{ '{{token}}' }} {{ _('placeholders listed below') }} + +{# GOOD: literal passed as kwarg; msgid stays as an entire sentence #} +{{ _('Accepts the %(token)s placeholders listed below', token='{{token}}') | safe }} +``` + +### Pattern 4 — `{% if %}` outside the `msgid` + +Move conditional branches outside `_()` so each branch is a complete sentence, not a fragment. + +```jinja +{# BAD: three fragments; SOV languages can't reorder the tag title relative to "URL or Title" #} +{{ _('URL or Title') }}{% if active_tag_uuid %} {{ _('in') }} '{{ active_tag.title }}'{% endif %} + +{# GOOD: branch between two complete msgids; each language can freely reorder %(title)s #} +{% if active_tag_uuid %} + {{ _("URL or Title in '%(title)s'", title=active_tag.title) }} +{% else %} + {{ _('URL or Title') }} +{% endif %} +``` + +--- + +## CJK italic policy + +CJK fonts typically have no true italic cut — `<i>` falls back to a mechanical slant that reduces +legibility. Now that `<i>` is inside `msgid`s, CJK translators can handle it per-locale. Apply this policy +for `ja` / `zh` / `zh_Hant_TW`: + +| Context | Action | +|---|---| +| `<i>` used for general emphasis | Replace with `<strong>`, or drop if the emphasis is self-evident | +| `<strong><i>...</i></strong>` | Collapse to `<strong>...</strong>` | +| `<i>` wrapping a UI term (e.g. 
"check unique lines") | Wrap in locale-conventional quotation marks: 「」 for `ja`/`zh_Hant_TW`, `""` for `zh` | + +--- + +## Translation workflow + +**Always use these commands** — they read consistent settings from `setup.cfg` and produce minimal diffs: + +```bash +python setup.py extract_messages # Extract translatable strings from source +python setup.py update_catalog # Propagate new msgids to all .po files +python setup.py compile_catalog # Compile .po files to binary .mo files +``` + +Running `pybabel` directly without the project options causes reordering, rewrapping, and line-number +churn that makes diffs hard to review. + +### Configuration + +All translation settings are in `setup.cfg` (single source of truth): ```ini [extract_messages] -# Extract translatable strings from source code mapping_file = babel.cfg output_file = changedetectionio/translations/messages.pot input_paths = changedetectionio keywords = _ _l gettext -# Options to reduce unnecessary changes in .pot files sort_by_file = true # Keeps entries ordered by file path width = 120 # Consistent line width (prevents rewrapping) add_location = file # Show file path only (not line numbers) [update_catalog] -# Update existing .po files with new strings from .pot -# Note: 'locale' is omitted - Babel auto-discovers all catalogs in output_dir input_file = changedetectionio/translations/messages.pot output_dir = changedetectionio/translations domain = messages -# Options for consistent formatting -width = 120 # Consistent line width +width = 120 no_fuzzy_matching = true # Avoids incorrect automatic matches [compile_catalog] -# Compile .po files to .mo binary format directory = changedetectionio/translations domain = messages ``` -**Key formatting options:** -- `sort_by_file = true` - Orders entries by file path (consistent ordering) -- `width = 120` - Fixed line width prevents text rewrapping -- `add_location = file` - Shows file path only, not line numbers (reduces git churn) -- `no_fuzzy_matching = 
true` - Prevents incorrect automatic fuzzy matches +--- -## Why Use These Commands? +## Multi-language fix process -Running pybabel commands directly without consistent options causes: -- ❌ Entries get reordered differently each time -- ❌ Text gets rewrapped at different widths -- ❌ Line numbers change every edit (if not configured) -- ❌ Large diffs that make code review difficult +When you find a translation error in **any** language, you must check all others for the same `msgid`: -Using `python setup.py` commands ensures: -- ✅ Consistent ordering (by file path, not alphabetically) -- ✅ Consistent line width (120 characters, no rewrapping) -- ✅ File-only locations (no line number churn) -- ✅ No fuzzy matching (prevents incorrect auto-translations) -- ✅ Minimal diffs (only actual changes show up) -- ✅ Easier code review and git history +```bash +for lang in cs de en_GB en_US es fr it ja ko pt_BR tr uk zh zh_Hant_TW; do + echo "=== $lang ===" && grep -A1 'msgid "YourString"' changedetectionio/translations/$lang/LC_MESSAGES/messages.po +done +``` -These commands read settings from `../../setup.cfg` automatically. +1. Identify every language with the same problem +2. Fix all affected `.po` files in the same session +3. Recompile: `python setup.py compile_catalog` -## Supported Languages +Never fix one language and move on. -- `cs` - Czech (Čeština) -- `de` - German (Deutsch) -- `en_GB` - English (UK) -- `en_US` - English (US) -- `fr` - French (Français) -- `it` - Italian (Italiano) -- `ja` - Japanese (日本語) -- `ko` - Korean (한국어) -- `pt_BR` - Portuguese (Brasil) -- `zh` - Chinese Simplified (中文简体) -- `zh_Hant_TW` - Chinese Traditional (繁體中文) +--- -## Adding a New Language +## Supported languages -1. Initialize the new language catalog: - ```bash - pybabel init -i changedetectionio/translations/messages.pot -d changedetectionio/translations -l NEW_LANG_CODE - ``` -2. 
Compile it: - ```bash - python setup.py compile_catalog - ``` +| Code | Language | +|---|---| +| `cs` | Czech (Čeština) | +| `de` | German (Deutsch) | +| `en_GB` | English (UK) | +| `en_US` | English (US) | +| `es` | Spanish (Español) | +| `fr` | French (Français) | +| `it` | Italian (Italiano) | +| `ja` | Japanese (日本語) | +| `ko` | Korean (한국어) | +| `pt_BR` | Portuguese (Brasil) | +| `tr` | Turkish (Türkçe) | +| `uk` | Ukrainian (Українська) | +| `zh` | Chinese Simplified (中文简体) | +| `zh_Hant_TW` | Chinese Traditional (繁體中文) | -Babel will auto-discover the new language on subsequent translation updates. +## Adding a new language -## Translation Notes +```bash +pybabel init -i changedetectionio/translations/messages.pot \ + -d changedetectionio/translations \ + -l NEW_LANG_CODE +python setup.py compile_catalog +``` -From CLAUDE.md: -- Always use "monitor" or "watcher" terminology (not "clock") -- Use the most brief wording suitable -- When finding issues in one language, check ALL languages for the same issue +Babel auto-discovers the new language on subsequent runs. + +--- + +## CI linter + +A GitHub Actions job (`lint-template-i18n`) checks for adjacent `{{ _(...) }}` calls on the same line +separated only by HTML — the primary symptom of fragmented `msgid`s. It enforces a declining baseline: +the count of existing violations may only go down, never up. When you fix a template, lower the +`BASELINE_LIMIT` in `.github/workflows/test-only.yml` by the number of lines you fixed. + +See [issue #4074](https://github.com/dgtlmoon/changedetection.io/issues/4074) for full background and +[PR #4076](https://github.com/dgtlmoon/changedetection.io/pull/4076) for worked consolidation examples.