bump version

Ability to use our own plugins to scrape extra data
2026-01-09 08:40:32 +00:00 · 2024-07-30 17:21:48 +02:00 · 2024-07-30 17:10:21 +02:00
15 changed files with 96 additions and 94 deletions
--- a/changedetectionio/init.py
+++ b/changedetectionio/init.py
@@ -2,7 +2,7 @@

 # Read more https://github.com/dgtlmoon/changedetection.io/wiki

-__version__ = '0.46.04'
+__version__ = '0.46.02'

 from changedetectionio.strtobool import strtobool
 from json.decoder import JSONDecodeError
--- a/changedetectionio/blueprint/browser_steps/init.py
+++ b/changedetectionio/blueprint/browser_steps/init.py
@@ -85,8 +85,7 @@ def construct_blueprint(datastore: ChangeDetectionStore):
        browsersteps_start_session['browserstepper'] = browser_steps.browsersteps_live_ui(
            playwright_browser=browsersteps_start_session['browser'],
            proxy=proxy,
-            start_url=datastore.data['watching'][watch_uuid].get('url'),
-            headers=datastore.data['watching'][watch_uuid].get('headers')
+            start_url=datastore.data['watching'][watch_uuid].get('url')
        )

        # For test
--- a/changedetectionio/content_fetchers/base.py
+++ b/changedetectionio/content_fetchers/base.py
@@ -51,7 +51,6 @@ class Fetcher():
    instock_data = None
    instock_data_js = ""
    status_code = None
-    total_bytes = None
    webdriver_js_execute_code = None
    xpath_data = None
    xpath_element_js = ""
@@ -66,8 +65,8 @@ class Fetcher():

    def __init__(self):
        import importlib.resources
-        self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text(encoding='utf-8')
-        self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text(encoding='utf-8')
+        self.xpath_element_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('xpath_element_scraper.js').read_text()
+        self.instock_data_js = importlib.resources.files("changedetectionio.content_fetchers.res").joinpath('stock-not-in-stock.js').read_text()

    @abstractmethod
    def get_error(self):
--- a/changedetectionio/content_fetchers/requests.py
+++ b/changedetectionio/content_fetchers/requests.py
@@ -12,27 +12,6 @@ from changedetectionio.content_fetchers.base import Fetcher
 class fetcher(Fetcher):
    fetcher_description = "Basic fast Plaintext/HTTP Client"

-    def get_total_bytes_received(self, response):
-        # Calculate the size of the response content
-        content_size = len(response.content)
-        # Calculate the size of the response headers
-        headers_size = sum(len(k) + len(v) for k, v in response.headers.items()) + len(response.headers) * 4  # adding 4 for ': ' and '\r\n'
-
-        # Total bytes received
-        total_received = content_size + headers_size
-        return total_received
-
-    def get_total_bytes_transferred(self, request):
-        # Calculate the size of the request headers
-        headers_size = sum(len(k) + len(v) for k, v in request.headers.items()) + len(request.headers) * 4  # adding 4 for ': ' and '\r\n'
-
-        # Calculate the size of the request body, if any
-        body_size = len(request.body or '')
-
-        # Total bytes transferred (request + response)
-        total_transferred = headers_size + body_size
-        return total_transferred
-
    def __init__(self, proxy_override=None, custom_browser_connection_url=None):
        super().__init__()
        self.proxy_override = proxy_override
@@ -81,10 +60,6 @@ class fetcher(Fetcher):
                            proxies=proxies,
                            verify=False)

-        total_received = self.get_total_bytes_received(response=r)
-        request_prepared = r.request
-        self.total_bytes = self.get_total_bytes_transferred(request_prepared) + total_received
-
        # If the response did not tell us what encoding format to expect, Then use chardet to override what `requests` thinks.
        # For example - some sites don't tell us it's utf-8, but return utf-8 content
        # This seems to not occur when using webdriver/selenium, it seems to detect the text encoding more reliably.
--- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js
+++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js
@@ -75,7 +75,6 @@ function isItemInStock() {
        'vergriffen',
        'vorbestellen',
        'vorbestellung ist bald möglich',
-        'we don\'t currently have any',
        'we couldn\'t find any products that match',
        'we do not currently have an estimate of when this product will be back in stock.',
        'we don\'t know when or if this item will be back in stock.',
@@ -174,8 +173,7 @@ function isItemInStock() {
        const element = elementsToScan[i];
        // outside the 'fold' or some weird text in the heading area
        // .getBoundingClientRect() was causing a crash in chrome 119, can only be run on contentVisibility != hidden
-        // Note: theres also an automated test that places the 'out of stock' text fairly low down
-        if (element.getBoundingClientRect().top + window.scrollY >= vh + 250 || element.getBoundingClientRect().top + window.scrollY <= 100) {
+        if (element.getBoundingClientRect().top + window.scrollY >= vh + 150 || element.getBoundingClientRect().top + window.scrollY <= 100) {
            continue
        }
        elementText = "";
@@ -189,7 +187,7 @@ function isItemInStock() {
            // and these mean its out of stock
            for (const outOfStockText of outOfStockTexts) {
                if (elementText.includes(outOfStockText)) {
-                    console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}" - offset top ${element.getBoundingClientRect().top}, page height is ${vh}`)
+                    console.log(`Selected 'Out of Stock' - found text "${outOfStockText}" - "${elementText}"`)
                    return outOfStockText; // item is out of stock
                }
            }
--- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js
+++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js
@@ -164,15 +164,6 @@ visibleElementsArray.forEach(function (element) {
        }
    }

-    let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
-
-    let text = element.textContent.trim().slice(0, 30).trim();
-    while (/\n{2,}|\t{2,}/.test(text)) {
-        text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
-    }
-
-    // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
-    const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) &&  /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;

    size_pos.push({
        xpath: xpath_result,
@@ -180,16 +171,9 @@ visibleElementsArray.forEach(function (element) {
        height: Math.round(bbox['height']),
        left: Math.floor(bbox['left']),
        top: Math.floor(bbox['top']) + scroll_y,
-        // tagName used by Browser Steps
        tagName: (element.tagName) ? element.tagName.toLowerCase() : '',
-        // tagtype used by Browser Steps
        tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
-        isClickable: window.getComputedStyle(element).cursor === "pointer",
-        // Used by the keras trainer
-        fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
-        fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
-        hasDigitCurrency: hasDigitCurrency,
-        label: label,
+        isClickable: window.getComputedStyle(element).cursor == "pointer"
    });

 });
--- a/changedetectionio/flask_app.py
+++ b/changedetectionio/flask_app.py
@@ -1377,19 +1377,17 @@ def changedetection_app(config=None, datastore_o=None):
        import brotli

        watch = datastore.data['watching'].get(uuid)
-        if watch and watch.history.keys() and os.path.isdir(watch.watch_data_dir):
-            latest_filename = list(watch.history.keys())[-1]
+        if watch and os.path.isdir(watch.watch_data_dir):
+            latest_filename = list(watch.history.keys())[0]
            html_fname = os.path.join(watch.watch_data_dir, f"{latest_filename}.html.br")
-            with open(html_fname, 'rb') as f:
-                if html_fname.endswith('.br'):
-                    # Read and decompress the Brotli file
+            if html_fname.endswith('.br'):
+                # Read and decompress the Brotli file
+                with open(html_fname, 'rb') as f:
                    decompressed_data = brotli.decompress(f.read())
-                else:
-                    decompressed_data = f.read()

-            buffer = BytesIO(decompressed_data)
+                buffer = BytesIO(decompressed_data)

-            return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')
+                return send_file(buffer, as_attachment=True, download_name=f"{latest_filename}.html", mimetype='text/html')


        # Return a 500 error
--- a/changedetectionio/processors/restock_diff/init.py
+++ b/changedetectionio/processors/restock_diff/init.py
@@ -1,12 +1,11 @@

-from babel.numbers import parse_decimal
 from changedetectionio.model.Watch import model as BaseWatch
-from typing import Union
 import re
+from babel.numbers import parse_decimal

 class Restock(dict):

-    def parse_currency(self, raw_value: str) -> Union[float, None]:
+    def parse_currency(self, raw_value: str) -> float:
        # Clean and standardize the value (ie 1,400.00 should be 1400.00), even better would be store the whole thing as an integer.
        standardized_value = raw_value

@@ -22,11 +21,8 @@ class Restock(dict):
        # Remove any non-numeric characters except for the decimal point
        standardized_value = re.sub(r'[^\d.-]', '', standardized_value)

-        if standardized_value:
-            # Convert to float
-            return float(parse_decimal(standardized_value, locale='en'))
-
-        return None
+        # Convert to float
+        return float(parse_decimal(standardized_value, locale='en'))

    def __init__(self, *args, **kwargs):
        # Define default values
--- a/changedetectionio/processors/restock_diff/hookspecs.py
+++ b/changedetectionio/processors/restock_diff/hookspecs.py
@@ -0,0 +1,23 @@
+import pluggy
+from typing import Dict
+from changedetectionio.model import Watch as Watch
+
+plugin_namespace = "changedetectionio.restock_price_scraper"  # same string plugin_manager.py passes to pluggy.PluginManager
+hookspec = pluggy.HookspecMarker(plugin_namespace)  # decorator used below to mark hook specifications
+
+class HookSpec:
+    @hookspec
+    def scrape_price_restock(self, watch: Watch.model, html_content: str, screenshot: bytes, update_obj: Dict) -> Dict:
+        """
+         Hook specification: scrape price and restock data from html_content and/or screenshot and return via update_obj
+
+         Args:
+             watch (Watch.model): The watch object containing watch configuration.
+             html_content (str): The HTML content to scrape.
+             screenshot (bytes): The screenshot data.
+             update_obj (Dict): The dictionary to update with scraped data.
+
+         Returns:
+             Dict: The updated dictionary with the scraped price data. NOTE(review): presumably None if no update is made, although the annotation says Dict - confirm.
+         """
+
--- a/changedetectionio/processors/restock_diff/plugin_manager.py
+++ b/changedetectionio/processors/restock_diff/plugin_manager.py
@@ -0,0 +1,17 @@
+import pluggy
+from .hookspecs import HookSpec
+import importlib.metadata
+
+# Entry-point group name; also the pluggy project name, so it must match the HookspecMarker in hookspecs.py
+plugin_namespace = "changedetectionio.restock_price_scraper"
+
+# Plugin manager for the restock/price scraper hooks, with the hook specifications registered
+pm = pluggy.PluginManager(plugin_namespace)
+pm.add_hookspecs(HookSpec)
+
+# Register installed plugins; entry_points().get() was removed in Python 3.12, .select() is absent only on 3.8/3.9
+_eps = importlib.metadata.entry_points()
+_eps = _eps.select(group=plugin_namespace) if hasattr(_eps, 'select') else _eps.get(plugin_namespace, [])
+for entry_point in _eps:
+    plugin = entry_point.load()
+    pm.register(plugin())
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@@ -40,16 +40,13 @@ def get_itemprop_availability(html_content) -> Restock:
    import extruct
    logger.trace(f"Imported extruct module in {time.time() - now:.3f}s")

+    value = {}
    now = time.time()
-
    # Extruct is very slow, I'm wondering if some ML is going to be faster (800ms on my i7), 'rdfa' seems to be the heaviest.
-    syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
-    try:
-        data = extruct.extract(html_content, syntaxes=syntaxes)
-    except Exception as e:
-        logger.warning(f"Unable to extract data, document parsing with extruct failed with {type(e).__name__} - {str(e)}")
-        return Restock()

+    syntaxes = ['dublincore', 'json-ld', 'microdata', 'microformat', 'opengraph']
+
+    data = extruct.extract(html_content, syntaxes=syntaxes)
    logger.trace(f"Extruct basic extract of all metadata done in {time.time() - now:.3f}s")

    # First phase, dead simple scanning of anything that looks useful
@@ -122,6 +119,8 @@ class perform_site_check(difference_detection_processor):
    xpath_data = None

    def run_changedetection(self, watch, skip_when_checksum_same=True):
+        from .plugin_manager import pm
+
        if not watch:
            raise Exception("Watch no longer exists.")

@@ -201,6 +200,19 @@ class perform_site_check(difference_detection_processor):
            update_obj['restock']["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False
            logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned '{self.fetcher.instock_data}' from JS scraper.")

+        # Ask any "changedetectionio.restock_price_scraper" namespace plugins if they can add something
+        # (Should return an updated 'update_obj')
+        plugin_price_scraping = pm.hook.scrape_price_restock(watch=watch,
+                                                             html_content=self.fetcher.content,
+                                                             screenshot=self.fetcher.screenshot,
+                                                             update_obj=update_obj)
+        if plugin_price_scraping:
+            for plugin_result in plugin_price_scraping:
+                update_obj.update(plugin_result)
+                if plugin_result.get('restock'):
+                    update_obj['restock'].update(plugin_result.get('restock'))
+
+
        # What we store in the snapshot
        price = update_obj.get('restock').get('price') if update_obj.get('restock').get('price') else ""
        snapshot_content = f"In Stock: {update_obj.get('restock').get('in_stock')} - Price: {price}"
--- a/changedetectionio/templates/watch-overview.html
+++ b/changedetectionio/templates/watch-overview.html
@@ -168,7 +168,7 @@
                        {% if watch.get('restock') and watch['restock']['price'] != None %}
                            {% if watch['restock']['price'] != None %}
                                <span class="restock-label price" title="Price">
-                                {{ watch['restock']['price']|format_number_locale }} {{ watch['restock']['currency'] }}
+                                {{ watch['restock']['price']|format_number_locale }} {% if watch['restock']['currency'] %} {{ watch['restock']['currency'] }}{% endif %}
                                </span>
                            {% endif %}
                        {% elif not watch.has_restock_info %}
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -69,12 +69,6 @@ def test_check_basic_change_detection_functionality(client, live_server, measure

    wait_for_all_checks(client)

-    uuid = extract_UUID_from_client(client)
-
-    # Check the 'get latest snapshot works'
-    res = client.get(url_for("watch_get_latest_html", uuid=uuid))
-    assert b'which has this one new line' in res.data
-
    # Now something should be ready, indicated by having a 'unviewed' class
    res = client.get(url_for("index"))
    assert b'unviewed' in res.data
@@ -92,7 +86,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    assert expected_url.encode('utf-8') in res.data

    # Following the 'diff' link, it should no longer display as 'unviewed' even after we recheck it a few times
-    res = client.get(url_for("diff_history_page", uuid=uuid))
+    res = client.get(url_for("diff_history_page", uuid="first"))
    assert b'selected=""' in res.data, "Confirm diff history page loaded"

    # Check the [preview] pulls the right one
@@ -149,12 +143,18 @@ def test_check_basic_change_detection_functionality(client, live_server, measure
    assert b'unviewed' not in res.data

    # #2458 "clear history" should make the Watch object update its status correctly when the first snapshot lands again
+    uuid = extract_UUID_from_client(client)
    client.get(url_for("clear_watch_history", uuid=uuid))
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
    wait_for_all_checks(client)
    res = client.get(url_for("index"))
    assert b'preview/' in res.data

+
+    # Check the 'get latest snapshot works'
+    res = client.get(url_for("watch_get_latest_html", uuid=uuid))
+    assert b'<head><title>head title</title></head>' in res.data
+
    #
    # Cleanup everything
    res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True)
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -18,7 +18,7 @@ services:
  #
  #        Log levels are in descending order. (TRACE is the most detailed one)
  #        Log output levels: TRACE, DEBUG(default), INFO, SUCCESS, WARNING, ERROR, CRITICAL
-  #      - LOGGER_LEVEL=TRACE
+  #      - LOGGER_LEVEL=DEBUG
  #
  #       Alternative WebDriver/selenium URL, do not use "'s or 's!
  #      - WEBDRIVER_URL=http://browser-chrome:4444/wd/hub
@@ -29,9 +29,8 @@ services:
  #
  #             https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.proxy
  #
-  #       Alternative target "Chrome" Playwright URL, do not use "'s or 's!
-  #       "Playwright" is a driver/librarythat allows changedetection to talk to a Chrome or similar browser.
-  #      - PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000
+  #       Alternative Playwright URL, do not use "'s or 's!
+  #      - PLAYWRIGHT_DRIVER_URL=ws://playwright-chrome:3000
  #
  #       Playwright proxy settings playwright_proxy_server, playwright_proxy_bypass, playwright_proxy_username, playwright_proxy_password
  #
@@ -74,10 +73,10 @@ services:
 #              condition: service_started


-     # Sockpuppetbrowser is basically chrome wrapped in an API for allowing fast fetching of web-pages.
+     # Used for fetching pages via Playwright+Chrome where you need Javascript support.
     # RECOMMENDED FOR FETCHING PAGES WITH CHROME
-#    sockpuppetbrowser:
-#        hostname: sockpuppetbrowser
+#    playwright-chrome:
+#        hostname: playwright-chrome
 #        image: dgtlmoon/sockpuppetbrowser:latest
 #        cap_add:
 #            - SYS_ADMIN
--- a/requirements.txt
+++ b/requirements.txt
@@ -79,9 +79,8 @@ pyppeteerstealth>=0.0.4
 pytest ~=7.2
 pytest-flask ~=1.2

-# Anything 4.0 and up but not 5.0
-jsonschema ~= 4.0
-
+# Pin jsonschema version to prevent build errors on armv6 while rpds-py wheels aren't available (1708)
+jsonschema==4.17.3

 loguru

@@ -93,3 +92,6 @@ babel

 # Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096
 greenlet >= 3.0.3
+
+# Our own plugins
+changedetection.io-amazon-price-scraper>=0.03
Author	SHA1	Message	Date
dgtlmoon	29a1651ae1	bump version	2024-07-30 17:21:48 +02:00
dgtlmoon	321ab19ffb	Ability to use our own plugins to scrape extra data	2024-07-30 17:10:21 +02:00