Re #3337 - Various fixes for 'Extract Data'

2025-11-12 12:36:48 +00:00 · 2025-07-28 17:38:42 +02:00
13 changed files with 50 additions and 365 deletions
--- a/changedetectionio/blueprint/settings/templates/settings.html
+++ b/changedetectionio/blueprint/settings/templates/settings.html
@@ -199,14 +199,6 @@ nav
                        </ul>
                     </span>
                    </fieldset>
-                    <fieldset class="pure-group">
-                        {{ render_field(form.application.form.custom_outofstock_strings) }}
-                        <span class="pure-form-message-inline">Additional custom out-of-stock detection strings (one per line).</span>
-                        </fieldset>
-                        <fieldset class="pure-group">
-                        {{ render_field(form.application.form.custom_instock_strings) }}
-                        <span class="pure-form-message-inline">Additional custom in-stock detection strings (one per line).</span>
-                    </fieldset>
           </div>

            <div class="tab-pane-inner" id="api">
--- a/changedetectionio/blueprint/ui/views.py
+++ b/changedetectionio/blueprint/ui/views.py
@@ -93,12 +93,15 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
            return redirect(url_for('watchlist.index'))

        # For submission of requesting an extract
-        extract_form = forms.extractDataForm(request.form)
+        extract_form = forms.extractDataForm(formdata=request.form,
+                                             data={'extract_regex': request.form.get('extract_regex', '')}
+                                             )
        if not extract_form.validate():
            flash("An error occurred, please see below.", "error")
+            return _render_diff_template(uuid, extract_form)

        else:
-            extract_regex = request.form.get('extract_regex').strip()
+            extract_regex = request.form.get('extract_regex', '').strip()
            output = watch.extract_regex_from_all_history(extract_regex)
            if output:
                watch_dir = os.path.join(datastore.datastore_path, uuid)
@@ -109,12 +112,11 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
                response.headers['Expires'] = "0"
                return response

-            flash('Nothing matches that RegEx', 'error')
-        redirect(url_for('ui_views.diff_history_page', uuid=uuid) + '#extract')
+            flash('No matches found while scanning all of the watch history for that RegEx.', 'error')
+        return redirect(url_for('ui.ui_views.diff_history_page', uuid=uuid) + '#extract')

-    @views_blueprint.route("/diff/<string:uuid>", methods=['GET'])
-    @login_optionally_required
-    def diff_history_page(uuid):
+    def _render_diff_template(uuid, extract_form=None):
+        """Helper function to render the diff template with all required data"""
        from changedetectionio import forms

        # More for testing, possible to return the first/only
@@ -128,8 +130,11 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
            flash("No history found for the specified link, bad link?", "error")
            return redirect(url_for('watchlist.index'))

-        # For submission of requesting an extract
-        extract_form = forms.extractDataForm(request.form)
+        # Use provided form or create a new one
+        if extract_form is None:
+            extract_form = forms.extractDataForm(formdata=request.form,
+                                                 data={'extract_regex': request.form.get('extract_regex', '')}
+                                                 )

        history = watch.history
        dates = list(history.keys())
@@ -170,7 +175,7 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe

        datastore.set_last_viewed(uuid, time.time())

-        output = render_template("diff.html",
+        return render_template("diff.html",
                                 current_diff_url=watch['url'],
                                 from_version=str(from_version),
                                 to_version=str(to_version),
@@ -193,7 +198,10 @@ def construct_blueprint(datastore: ChangeDetectionStore, update_q, queuedWatchMe
                                 watch_a=watch
                                 )

-        return output
+    @views_blueprint.route("/diff/<string:uuid>", methods=['GET'])
+    @login_optionally_required
+    def diff_history_page(uuid):  # GET view for a watch's diff page; rendering is delegated to _render_diff_template
+        return _render_diff_template(uuid)  # no extract_form passed, so the helper builds one from request.form

    @views_blueprint.route("/form/add/quickwatch", methods=['POST'])
    @login_optionally_required
--- a/changedetectionio/content_fetchers/res/stock-not-in-stock.js
+++ b/changedetectionio/content_fetchers/res/stock-not-in-stock.js
@@ -1,8 +1,8 @@
-async (customOutOfStockStrings = []) => {
+async () => {

    function isItemInStock() {
        // @todo Pass these in so the same list can be used in non-JS fetchers
-        const builtInOutOfStockTexts = [
+        const outOfStockTexts = [
            ' أخبرني عندما يتوفر',
            '0 in stock',
            'actuellement indisponible',
@@ -110,9 +110,6 @@ async (customOutOfStockStrings = []) => {
            '품절'
        ];

-        // Combine built-in strings with custom strings provided by user
-        const outOfStockTexts = [...builtInOutOfStockTexts, ...customOutOfStockStrings];
-

        const vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0);

--- a/changedetectionio/forms.py
+++ b/changedetectionio/forms.py
@@ -396,6 +396,19 @@ def validate_url(test_url):
        # This should be wtforms.validators.
        raise ValidationError('Watch protocol is not permitted by SAFE_PROTOCOL_REGEX or incorrect URL format')

+
+class ValidateSinglePythonRegexString(object):  # form-field validator: field.data must compile as one Python regex
+    def __init__(self, message=None):
+        self.message = message  # NOTE(review): stored but never read by __call__, so a custom message currently has no effect
+
+    def __call__(self, form, field):  # returns None when the pattern compiles, raises ValidationError otherwise
+        try:
+            re.compile(field.data)  # compiled only to validate; the compiled pattern is discarded
+        except re.error:  # NOTE(review): a non-string field.data would raise TypeError, which is not caught here - confirm a required-check always runs first
+            message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
+            raise ValidationError(message % (field.data))  # the rejected pattern is echoed back in the error text
+
+
 class ValidateListRegex(object):
    """
    Validates that anything that looks like a regex passes as a regex
@@ -414,6 +427,7 @@ class ValidateListRegex(object):
                    message = field.gettext('RegEx \'%s\' is not a valid regular expression.')
                    raise ValidationError(message % (line))

+
 class ValidateCSSJSONXPATHInput(object):
    """
    Filter validation
@@ -774,20 +788,6 @@ class globalSettingsApplicationForm(commonSettingsForm):
                                                                                                     message="Should contain zero or more attempts")])
    ui = FormField(globalSettingsApplicationUIForm)

-    #@todo better validations?
-
-    custom_outofstock_strings = StringListField('Custom out-of-stock detection strings',
-                                              [validators.Optional()],
-                                              render_kw={
-                                                  "placeholder": "Enter custom out-of-stock strings, one per line\nExample:\nPronto estarán en stock!\nTemporarily out of stock",
-                                                  "rows": "3"})
-
-    custom_instock_strings = StringListField('Custom in-stock detection strings',
-                                           [validators.Optional()],
-                                           render_kw={
-                                               "placeholder": "Enter custom in-stock strings, one per line\nExample:\nDisponible ahora\nIn voorraad",
-                                               "rows": "3"})
-

 class globalSettingsForm(Form):
    # Define these as FormFields/"sub forms", this way it matches the JSON storage
@@ -805,5 +805,5 @@ class globalSettingsForm(Form):


 class extractDataForm(Form):
-    extract_regex = StringField('RegEx to extract', validators=[validators.Length(min=1, message="Needs a RegEx")])
+    extract_regex = StringField('RegEx to extract', validators=[validators.DataRequired(), ValidateSinglePythonRegexString()])
    extract_submit_button = SubmitField('Extract as CSV', render_kw={"class": "pure-button pure-button-primary"})
--- a/changedetectionio/model/App.py
+++ b/changedetectionio/model/App.py
@@ -38,8 +38,6 @@ class model(dict):
                    # Custom notification content
                    'api_access_token_enabled': True,
                    'base_url' : None,
-                    'custom_instock_strings': [],
-                    'custom_outofstock_strings' : [],
                    'empty_pages_are_a_change': False,
                    'extract_title_as_title': False,
                    'fetch_backend': getenv("DEFAULT_FETCH_BACKEND", "html_requests"),
--- a/changedetectionio/model/Watch.py
+++ b/changedetectionio/model/Watch.py
@@ -639,7 +639,7 @@ class model(watch_base):
                    if res:
                        if not csv_writer:
                            # A file on the disk can be transferred much faster via flask than a string reply
-                            csv_output_filename = 'report.csv'
+                            csv_output_filename = f"report-{self.get('uuid')}.csv"
                            f = open(os.path.join(self.watch_data_dir, csv_output_filename), 'w')
                            # @todo some headers in the future
                            #fieldnames = ['Epoch seconds', 'Date']
--- a/changedetectionio/processors/restock_diff/forms.py
+++ b/changedetectionio/processors/restock_diff/forms.py
@@ -1,8 +1,7 @@
 from wtforms import (
    BooleanField,
    validators,
-    FloatField,
-    TextAreaField
+    FloatField
 )
 from wtforms.fields.choices import RadioField
 from wtforms.fields.form import FormField
@@ -30,7 +29,6 @@ class RestockSettingsForm(Form):

    follow_price_changes = BooleanField('Follow price changes', default=True)

-
 class processor_settings_form(processor_text_json_diff_form):
    restock_settings = FormField(RestockSettingsForm)

@@ -76,7 +74,7 @@ class processor_settings_form(processor_text_json_diff_form):
                    {{ render_field(form.restock_settings.price_change_threshold_percent) }}
                    <span class="pure-form-message-inline">Price must change more than this % to trigger a change since the first check.</span><br>
                    <span class="pure-form-message-inline">For example, If the product is $1,000 USD originally, <strong>2%</strong> would mean it has to change more than $20 since the first check.</span><br>
-                </fieldset>           
+                </fieldset>                
            </div>
        </fieldset>
        """
--- a/changedetectionio/processors/restock_diff/processor.py
+++ b/changedetectionio/processors/restock_diff/processor.py
@@ -143,89 +143,6 @@ def is_between(number, lower=None, upper=None):
 class perform_site_check(difference_detection_processor):
    screenshot = None
    xpath_data = None
-    
-    def _normalize_text_for_matching(self, text):
-        """
-        Normalize text for more robust matching:
-        - Convert to lowercase
-        - Remove accents/diacritics  
-        - Normalize whitespace
-        """
-        import unicodedata
-        import re
-        
-        if not text:
-            return ""
-            
-        # Convert to lowercase
-        text = text.lower()
-        
-        # Remove accents/diacritics (NFD normalization + filter)
-        # This converts "é" to "e", "ñ" to "n", etc.
-        text = unicodedata.normalize('NFD', text)
-        text = ''.join(char for char in text if unicodedata.category(char) != 'Mn')
-        
-        # Normalize whitespace (replace multiple spaces/tabs/newlines with single space)
-        text = re.sub(r'\s+', ' ', text).strip()
-        
-        return text
-
-    def _check_custom_strings(self, text_to_check, custom_strings, string_type="out-of-stock"):
-        """
-        Check text against custom strings (either in-stock or out-of-stock).
-        Uses normalized matching for better international support.
-        Returns the matched string if found, None otherwise.
-        """
-        if not custom_strings:
-            return None
-            
-        # Split custom strings by newlines and clean them up
-        raw_custom_list = [s.strip() for s in custom_strings.split('\n') if s.strip()]
-        
-        if not raw_custom_list:
-            return None
-            
-        # Normalize both the page text and custom strings for matching
-        normalized_text = self._normalize_text_for_matching(text_to_check)
-        
-        # Check each custom string against the text
-        for original_custom_text in raw_custom_list:
-            normalized_custom_text = self._normalize_text_for_matching(original_custom_text)
-            
-            if normalized_custom_text and normalized_custom_text in normalized_text:
-                logger.debug(f"Custom {string_type} string found: '{original_custom_text}' (normalized: '{normalized_custom_text}')")
-                return original_custom_text  # Return the original user-provided string
-                
-        return None
-    
-    def _get_combined_instock_strings(self, restock_settings):
-        """
-        Get combined list of built-in and custom in-stock strings.
-        Custom strings are normalized for better matching.
-        """
-        # Built-in in-stock strings (from the TODO line)
-        builtin_instock_strings = [
-            'instock',
-            'instoreonly', 
-            'limitedavailability',
-            'onlineonly',
-            'presale'
-        ]
-        
-        # Add custom in-stock strings if provided
-        custom_strings = restock_settings.get('custom_instock_strings', '').strip()
-        if custom_strings:
-            # Normalize custom strings for better matching
-            custom_list = []
-            for s in custom_strings.split('\n'):
-                s = s.strip()
-                if s:
-                    normalized = self._normalize_text_for_matching(s)
-                    if normalized:
-                        custom_list.append(normalized)
-            builtin_instock_strings.extend(custom_list)
-            
-        return builtin_instock_strings

    def run_changedetection(self, watch):
        import hashlib
@@ -288,7 +205,6 @@ class perform_site_check(difference_detection_processor):

            if itemprop_availability.get('availability'):
                # @todo: Configurable?
-
                if any(substring.lower() in itemprop_availability['availability'].lower() for substring in [
                    'instock',
                    'instoreonly',
@@ -322,8 +238,6 @@ class perform_site_check(difference_detection_processor):
        if self.fetcher.instock_data and itemprop_availability.get('availability') is None:
            # 'Possibly in stock' comes from stock-not-in-stock.js when no string found above the fold.
            # Careful! this does not really come from chrome/js when the watch is set to plaintext
-            stock_detection_result = self.fetcher.instock_data
-
            update_obj['restock']["in_stock"] = True if self.fetcher.instock_data == 'Possibly in stock' else False
            logger.debug(f"Watch UUID {watch.get('uuid')} restock check returned instock_data - '{self.fetcher.instock_data}' from JS scraper.")

--- a/changedetectionio/static/styles/scss/parts/_lister_extra.scss
+++ b/changedetectionio/static/styles/scss/parts/_lister_extra.scss
@@ -6,19 +6,19 @@
      }
    }
  }
-
-  tr {
-    /* make the icons and the text inline-ish */
-    td.inline.title-col {
-      .flex-wrapper {
-        display: flex;
-        align-items: center;
-        gap: 4px;
+  &.favicon-enabled {
+    tr {
+      /* make the icons and the text inline-ish */
+      td.inline.title-col {
+        .flex-wrapper {
+          display: flex;
+          align-items: center;
+          gap: 4px;
+        }
      }
    }
  }

-
  td,
  th {
    vertical-align: middle;
--- a/changedetectionio/static/styles/styles.css
+++ b/changedetectionio/static/styles/styles.css
--- a/changedetectionio/tests/restock/test_restock.py
+++ b/changedetectionio/tests/restock/test_restock.py
@@ -111,130 +111,3 @@ def test_restock_detection(client, live_server, measure_memory_usage):
    res = client.get(url_for("watchlist.index"))
    assert b'not-in-stock' in res.data, "Correctly showing NOT IN STOCK in the list after it changed from IN STOCK"

-
-def test_restock_custom_strings(client, live_server):
-    """Test custom out-of-stock strings feature"""
-    
-    # Set up a response with custom out-of-stock text
-    test_return_data = """<html>
-       <body>
-       Some initial text<br>
-       <p>Which is across multiple lines</p>
-       <br>
-       So let's see what happens.  <br>
-       <div>price: $10.99</div>
-       <div id="custom">Pronto estarán en stock!</div>
-       </body>
-       </html>
-    """
-    
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write(test_return_data)
-    
-    test_url = url_for('test_endpoint', _external=True).replace('http://localhost', 'http://changedet')
-
-    # Add watch with custom out-of-stock strings
-    res = client.post(
-        url_for("ui.ui_views.form_quick_watch_add"),
-        data={"url": test_url, "tags": '', 'processor': 'restock_diff'},
-        follow_redirects=True
-    )
-    
-    # Get the UUID so we can configure the watch
-    uuid = extract_UUID_from_client(client)
-    
-    # Configure custom out-of-stock strings
-    res = client.post(
-        url_for("ui.ui_edit.edit_page", uuid=uuid, unpause_on_save=1),
-        data={
-            "url": test_url,
-            'processor': 'restock_diff',
-            'restock_settings-custom_outofstock_strings': 'Pronto estarán en stock!\nCustom unavailable message'
-        },
-        follow_redirects=True
-    )
-    assert b"Updated watch." in res.data
-    
-    # Check that it detects as out of stock
-    wait_for_all_checks(client)
-    res = client.get(url_for("watchlist.index"))
-    assert b'not-in-stock' in res.data, "Should detect custom out-of-stock string"
-    
-    # Test custom in-stock strings by changing the content
-    test_return_data_instock = """<html>
-       <body>
-       Some initial text<br>
-       <p>Which is across multiple lines</p>
-       <br>
-       So let's see what happens.  <br>
-       <div>price: $10.99</div>
-       <div id="custom">Disponible ahora</div>
-       </body>
-       </html>
-    """
-    
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write(test_return_data_instock)
-    
-    # Update the watch to include custom in-stock strings
-    res = client.post(
-        url_for("ui.ui_edit.edit_page", uuid=uuid, unpause_on_save=1),
-        data={
-            "url": test_url,
-            'processor': 'restock_diff',
-            'restock_settings-custom_outofstock_strings': 'Pronto estarán en stock!\nCustom unavailable message',
-            'restock_settings-custom_instock_strings': 'Disponible ahora\nIn voorraad'
-        },
-        follow_redirects=True
-    )
-    assert b"Updated watch." in res.data
-    
-    # Check again - should be detected as in stock now
-    client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
-    wait_for_all_checks(client)
-    res = client.get(url_for("watchlist.index"))
-    assert b'not-in-stock' not in res.data, "Should detect custom in-stock string and show as available"
-
-
-def test_restock_custom_strings_normalization(client, live_server):
-    """Test key normalization scenarios: accents, case, and spaces"""
-    
-    # Test page with Spanish text with accents and mixed case
-    test_return_data = """<html>
-       <body>
-       <div>price: $10.99</div>
-       <div id="status">¡TEMPORALMENTE    AGOTADO!</div>
-       </body>
-       </html>
-    """
-    
-    with open("test-datastore/endpoint-content.txt", "w") as f:
-        f.write(test_return_data)
-    
-    test_url = url_for('test_endpoint', _external=True).replace('http://localhost', 'http://changedet')
-    
-    # Add watch
-    res = client.post(
-        url_for("ui.ui_views.form_quick_watch_add"),
-        data={"url": test_url, "tags": '', 'processor': 'restock_diff'},
-        follow_redirects=True
-    )
-    
-    uuid = extract_UUID_from_client(client)
-    
-    # Configure custom string without accents, lowercase, no extra spaces
-    res = client.post(
-        url_for("ui.ui_edit.edit_page", uuid=uuid, unpause_on_save=1),
-        data={
-            "url": test_url,
-            'processor': 'restock_diff',
-            'restock_settings-custom_outofstock_strings': 'temporalmente agotado'
-        },
-        follow_redirects=True
-    )
-    
-    # Should detect as out of stock despite text differences
-    wait_for_all_checks(client)
-    res = client.get(url_for("watchlist.index"))
-    assert b'not-in-stock' in res.data, "Should match despite accents, case, and spacing differences"
-
--- a/changedetectionio/tests/test_extract_csv.py
+++ b/changedetectionio/tests/test_extract_csv.py
@@ -46,7 +46,7 @@ def test_check_extract_text_from_diff(client, live_server, measure_memory_usage)
        follow_redirects=False
    )

-    assert b'Nothing matches that RegEx' not in res.data
+    assert b'No matches found while scanning all of the watch history for that RegEx.' not in res.data
    assert res.content_type == 'text/csv'

    # Read the csv reply as stringio
--- a/changedetectionio/tests/unit/test_custom_string_normalization.py
+++ b/changedetectionio/tests/unit/test_custom_string_normalization.py
@@ -1,95 +0,0 @@
-#!/usr/bin/env python3
-
-import unittest
-from changedetectionio.processors.restock_diff.processor import perform_site_check
-
-
-class TestCustomStringNormalization(unittest.TestCase):
-    """Test the text normalization logic for custom out-of-stock strings"""
-    
-    def setUp(self):
-        # Create a processor instance for testing
-        self.processor = perform_site_check(datastore=None, watch_uuid='test')
-    
-    def test_normalize_text_for_matching(self):
-        """Test the _normalize_text_for_matching method"""
-        
-        test_cases = [
-            # (input, expected_output)
-            ("Agotado", "agotado"),
-            ("AGOTADO", "agotado"),  # Lowercase
-            ("Sin   stock!", "sin stock!"),  # Normalize whitespace
-            ("Pronto\t\nestarán\nen stock", "pronto estaran en stock"),  # Multiple whitespace types + accents
-            ("¡Temporalmente  AGOTADO!", "¡temporalmente agotado!"),  # Complex case
-            ("", ""),  # Empty string
-            ("café", "cafe"),  # French accent
-            ("naïve", "naive"),  # Multiple accents
-        ]
-        
-        for input_text, expected in test_cases:
-            with self.subTest(input_text=input_text):
-                result = self.processor._normalize_text_for_matching(input_text)
-                self.assertEqual(result, expected, 
-                    f"Failed to normalize '{input_text}' -> expected '{expected}', got '{result}'")
-    
-    def test_check_custom_strings_normalization(self):
-        """Test that custom string matching works with normalization"""
-        
-        test_cases = [
-            # (page_text, custom_strings, should_match, description)
-            ("AGOTADO", "agotado", True, "uppercase to lowercase"),
-            ("Agotado", "agotado", True, "single uppercase to lowercase"),
-            ("Sin   stock!", "sin stock", True, "multiple spaces normalized"),
-            ("¡Pronto    estarán   en stock!", "pronto estaran en stock", True, "accents + spaces"),
-            ("TEMPORALMENTE AGOTADO", "temporalmente agotado", True, "multi-word uppercase"),
-            ("Available now", "agotado", False, "no match case"),
-            ("", "agotado", False, "empty text"),
-            ("agotado", "", False, "empty custom strings"),
-        ]
-        
-        for page_text, custom_strings, should_match, description in test_cases:
-            with self.subTest(description=description):
-                result = self.processor._check_custom_strings(page_text, custom_strings, "out-of-stock")
-                
-                if should_match:
-                    self.assertIsNotNone(result, 
-                        f"Expected match for '{description}': '{page_text}' should match '{custom_strings}'")
-                else:
-                    self.assertIsNone(result, 
-                        f"Expected no match for '{description}': '{page_text}' should not match '{custom_strings}'")
-    
-    def test_check_custom_strings_multiline(self):
-        """Test that multi-line custom strings work properly"""
-        
-        page_text = "Product status: TEMPORALMENTE AGOTADO"
-        custom_strings = """
-        sin stock
-        agotado
-        temporalmente agotado
-        """
-        
-        result = self.processor._check_custom_strings(page_text, custom_strings, "out-of-stock")
-        self.assertIsNotNone(result)
-        self.assertEqual(result.strip(), "temporalmente agotado")
-    
-    def test_get_combined_instock_strings_normalization(self):
-        """Test that custom in-stock strings are normalized properly"""
-        
-        restock_settings = {
-            'custom_instock_strings': 'Disponible AHORA\nEn Stock\nDISPONÍBLE'
-        }
-        
-        result = self.processor._get_combined_instock_strings(restock_settings)
-        
-        # Check that built-in strings are included
-        self.assertIn('instock', result)
-        self.assertIn('presale', result)
-        
-        # Check that custom strings are normalized and included
-        self.assertIn('disponible ahora', result)
-        self.assertIn('en stock', result)
-        self.assertIn('disponible', result)  # accent removed
-
-
-if __name__ == '__main__':
-    unittest.main()