Re #2568 Add encoding for scraper script reader

2026-01-10 17:20:26 +00:00 · 2024-08-18 17:52:25 +02:00
10 changed files with 35 additions and 65 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -2,7 +2,7 @@

 # Read more https://github.com/dgtlmoon/changedetection.io/wiki

-__version__ = '0.46.03'
+__version__ = '0.46.02'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
--- a/changedetectionio/blueprint/browser_steps/init.py
+++ b/changedetectionio/blueprint/browser_steps/init.py
@@ -85,8 +85,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
        browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
            playwright_browser=browsersteps_start_session['browser'],
            proxy=proxy,
-            start_url=datastore.data['watching'][watch_uuid].get('url'),
-            headers=datastore.data['watching'][watch_uuid].get('headers')
+            start_url=datastore.data['watching'][watch_uuid].get('url')
        )

        # For test
--- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js
+++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js
@@ -75,7 +75,6 @@ function isItemInStock() {
        'vergriffen',
        'vorbestellen',
        'vorbestellung ist bald möglich',
-        'we don\'t currently have any',
        'we couldn\'t find any products that match',
        'we do not currently have an estimate of when this product will be back in stock.',
        'we don\'t know when or if this item will be back in stock.',
@@ -174,8 +173,7 @@ function isItemInStock() {
        const element = elementsToScan[i];
        // outside the 'fold' or some weird text in the heading area
        // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
-        // Note: theres also an automated test that places the 'out of stock' text fairly low down
-        if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
+        if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
            continue
        }
        elementText = "";
@@ -189,7 +187,7 @@ function isItemInStock() {
            // and these mean its out of stock
            for (const outOfStockText of outOfStockTexts) {
                if (elementText.includes(outOfStockText)) {
-                    console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
+                    console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
                    return outOfStockText; // item is out of stock
                }
            }
--- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js
+++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js
@@ -164,15 +164,6 @@ visibleElementsArray.forEach(function (element) {
        }
    }

-    let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
-
-    let text = element.textContent.trim().slice(0, 30).trim();
-    while (/\n{2,}|\t{2,}/.test(text)) {
-        text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
-    }
-
-    // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
-    const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) &&  /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;

    size_pos.push({
        xpath: xpath_result,
@@ -180,16 +171,9 @@ visibleElementsArray.forEach(function (element) {
        height: Math.round(bbox['height']),
        left: Math.floor(bbox['left']),
        top: Math.floor(bbox['top']) + scroll_y,
-        // tagName used by Browser Steps
        tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
-        // tagtype used by Browser Steps
        tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
-        isClickable: window.getComputedStyle(element).cursor === "pointer",
-        // Used by the keras trainer
-        fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
-        fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
-        hasDigitCurrency: hasDigitCurrency,
-        label: label,
+        isClickable: window.getComputedStyle(element).cursor == "pointer"
    });

 });
--- a/changedetectionio/flask_app.py
+++ b/changedetectionio/flask_app.py
@@ -1377,19 +1377,17 @@ def changedetection_app(config=None, datastore_o=None):
        import brotli

        watch = datastore.data['watching'].get(uuid)
-        if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
-            latest_filename = list(watch.history.keys())[-1]
+        if watch and os.path.isdir(watch.watch_data_dir):
+            latest_filename = list(watch.history.keys())[0]
            html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
-            with open(html_fname, 'rb') as f:
-                if html_fname.endswith('.br'):
-                    # Read and decompress the Brotli file
+            if html_fname.endswith('.br'):
+                # Read and decompress the Brotli file
+                with open(html_fname, 'rb') as f:
                    decompressed_data = brotli.decompress(f.read())
-                else:
-                    decompressed_data = f.read()

-            buffer = BytesIO(decompressed_data)
+                buffer = BytesIO(decompressed_data)

-            return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
+                return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')


        # Return a 500 error
--- a/changedetectionio/processors/restock_diff/init.py
+++ b/changedetectionio/processors/restock_diff/init.py
@@ -1,12 +1,11 @@

-from babel.numbers import parse_decimal
 from changedetectionio.model.Watch import model as BaseWatch
-from typing import Union
 import re
+from babel.numbers import parse_decimal

 class Restock(dict):

-    def parse_currency(self, raw_value: str) -> Union[float, None]:
+    def parse_currency(self, raw_value: str) -> float:
        # Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
        standardized_value = raw_value

@@ -22,11 +21,8 @@ class Restock(dict):
        # Remove any non-numeric characters except for the decimal point
        standardized_value = re.sub(r'[^\d.-]', '', standardized_value)

-        if standardized_value:
-            # Convert to float
-            return float(parse_decimal(standardized_value, locale='en'))
-
-        return None
+        # Convert to float
+        return float(parse_decimal(standardized_value, locale='en'))

    def __init__(self, *args, **kwargs):
        # Define default values
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@@ -40,16 +40,13 @@ def get_itemprop_availability(html_content) -> Restock:
    import extruct
    logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")

+    value = {}
    now = time.time()
-
    # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
-    syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
-    try:
-        data = extruct.extract(html_content, syntaxes=syntaxes)
-    except Exception as e:
-        logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
-        return Restock()

+    syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
+
+    data = extruct.extract(html_content, syntaxes=syntaxes)
    logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")

    # First phase, dead simple scanning of anything that looks useful
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -69,12 +69,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure

    wait_for_all_checks(client)

-    uuid = extract_UUID_from_client(client)
-
-    # Check the 'get latest snapshot works'
-    res = client.get(url_for("watch_get_latest_html", uuid=uuid))
-    assert b'which has this one new line' in res.data
-
    # Now something should be ready, indicated by having a 'unviewed' class
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data
@@ -92,7 +86,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    assert expected_url.encode('utf-8') in res.data

    # Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
-    res = client.get(url_for("diff_history_page", uuid=uuid))
+    res = client.get(url_for("diff_history_page", uuid="first"))
    assert b'selected=""' in res.data, "Confirm diff history page loaded"

    # Check the [preview] pulls the right one
@@ -149,12 +143,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    assert b'unviewed' not in res.data

    # #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
+    uuid = extract_UUID_from_client(client)
    client.get(url_for("clear_watch_history", uuid=uuid))
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)
    res = client.get(url_for("index"))
    assert b'preview/' in res.data

+
+    # Check the 'get latest snapshot works'
+    res = client.get(url_for("watch_get_latest_html", uuid=uuid))
+    assert b'<head><title>head title</title></head>' in res.data
+
    #
    # Cleanup everything
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -18,7 +18,7 @@ services:
  #
  #        Log levels are in descending order. (TRACE is the most detailed one)
  #        Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
-  #      - LOGGER_LEVEL=TRACE
+  #      - LOGGER_LEVEL=DEBUG
  #
  #       Alternative WebDriver/selenium URL, do not use "'s or 's!
  #      - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
@@ -29,9 +29,8 @@ services:
  #
  #             https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
  #
-  #       Alternative target "Chrome" Playwright URL, do not use "'s or 's!
-  #       "Playwright" is a driver/librarythat allows changedetection to talk to a Chrome or similar browser.
-  #      - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
+  #       Alternative Playwright URL, do not use "'s or 's!
+  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
  #
  #       Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
  #
@@ -74,10 +73,10 @@ services:
 #              condition: service_started


-     # Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
+     # Used for fetching pages via Playwright+Chrome where you need JavaScript support.
     # RECOMMENDED FOR FETCHING PAGES WITH CHROME
-#    sockpuppetbrowser:
-#        hostname: sockpuppetbrowser
+#    playwright-chrome:
+#        hostname: playwright-chrome
 #        image: dgtlmoon/sockpuppetbrowser:latest
 #        cap_add:
 #            - SYS_ADMIN
--- a/requirements.txt
+++ b/requirements.txt
@@ -79,9 +79,8 @@ pyppeteerstealth>=0.0.4
 pytest ~=7.2
 pytest-flask ~=1.2

-# Anything 4.0 and up but not 5.0
-jsonschema ~= 4.0
-
+# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
+jsonschema==4.17.3

 loguru