mirror of
				https://github.com/dgtlmoon/changedetection.io.git
				synced 2025-11-04 08:34:57 +00:00 
			
		
		
		
	Compare commits
	
		
			6 Commits
		
	
	
		
			0.45.7.2
			...
			refactor/r
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					114dab23e9 | ||
| 
						 | 
					96ff5dbeeb | ||
| 
						 | 
					8898f1ba01 | ||
| 
						 | 
					b069c2d04a | ||
| 
						 | 
					2e451e1f8a | ||
| 
						 | 
					ced1c66e4d | 
@@ -11,6 +11,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Some common stuff here that can be moved to a base class
 | 
					# Some common stuff here that can be moved to a base class
 | 
				
			||||||
 | 
					# (set_proxy_from_list)
 | 
				
			||||||
class perform_site_check():
 | 
					class perform_site_check():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, *args, datastore, **kwargs):
 | 
					    def __init__(self, *args, datastore, **kwargs):
 | 
				
			||||||
@@ -45,6 +46,20 @@ class perform_site_check():
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
        return proxy_args
 | 
					        return proxy_args
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Doesn't look like python supports forward slash auto enclosure in re.findall
 | 
				
			||||||
 | 
					    # So convert it to inline flag "foobar(?i)" type configuration
 | 
				
			||||||
 | 
					    def forward_slash_enclosed_regex_to_options(self, regex):
 | 
				
			||||||
 | 
					        res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if res:
 | 
				
			||||||
 | 
					            regex = res.group(1)
 | 
				
			||||||
 | 
					            regex += '(?{})'.format(res.group(2))
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            regex += '(?{})'.format('i')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return regex
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def run(self, uuid):
 | 
					    def run(self, uuid):
 | 
				
			||||||
        timestamp = int(time.time())  # used for storage etc too
 | 
					        timestamp = int(time.time())  # used for storage etc too
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -215,15 +230,27 @@ class perform_site_check():
 | 
				
			|||||||
        if len(extract_text) > 0:
 | 
					        if len(extract_text) > 0:
 | 
				
			||||||
            regex_matched_output = []
 | 
					            regex_matched_output = []
 | 
				
			||||||
            for s_re in extract_text:
 | 
					            for s_re in extract_text:
 | 
				
			||||||
                result = re.findall(s_re.encode('utf8'), stripped_text_from_html,
 | 
					                # incase they specified something in '/.../x'
 | 
				
			||||||
                                    flags=re.MULTILINE | re.DOTALL | re.LOCALE)
 | 
					                regex = self.forward_slash_enclosed_regex_to_options(s_re)
 | 
				
			||||||
                if result:
 | 
					                result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
 | 
				
			||||||
                    regex_matched_output = regex_matched_output + result
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                for l in result:
 | 
				
			||||||
 | 
					                    if type(l) is tuple:
 | 
				
			||||||
 | 
					                        #@todo - some formatter option default (between groups)
 | 
				
			||||||
 | 
					                        regex_matched_output += list(l) + [b'\n']
 | 
				
			||||||
 | 
					                    else:
 | 
				
			||||||
 | 
					                        # @todo - some formatter option default (between each ungrouped result)
 | 
				
			||||||
 | 
					                        regex_matched_output += [l] + [b'\n']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # Now we will only show what the regex matched
 | 
				
			||||||
 | 
					            stripped_text_from_html = b''
 | 
				
			||||||
 | 
					            text_content_before_ignored_filter = b''
 | 
				
			||||||
            if regex_matched_output:
 | 
					            if regex_matched_output:
 | 
				
			||||||
                stripped_text_from_html = b'\n'.join(regex_matched_output)
 | 
					                # @todo some formatter for presentation?
 | 
				
			||||||
 | 
					                stripped_text_from_html = b''.join(regex_matched_output)
 | 
				
			||||||
                text_content_before_ignored_filter = stripped_text_from_html
 | 
					                text_content_before_ignored_filter = stripped_text_from_html
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
 | 
					        # Re #133 - if we should strip whitespaces from triggering the change detected comparison
 | 
				
			||||||
        if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
 | 
					        if self.datastore.data['settings']['application'].get('ignore_whitespace', False):
 | 
				
			||||||
            fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
 | 
					            fetched_md5 = hashlib.md5(stripped_text_from_html.translate(None, b'\r\n\t ')).hexdigest()
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -239,8 +239,15 @@ Unavailable") }}
 | 
				
			|||||||
                        {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
 | 
					                        {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
 | 
				
			||||||
                        <span class="pure-form-message-inline">
 | 
					                        <span class="pure-form-message-inline">
 | 
				
			||||||
                    <ul>
 | 
					                    <ul>
 | 
				
			||||||
                        <li>Extracts text in the final output after other filters using regular expressions, for example <code>\d+ online</code></li>
 | 
					                        <li>Extracts text in the final output (line by line) after other filters using regular expressions;
 | 
				
			||||||
                        <li>One line per regular-expression.</li>
 | 
					                            <ul>
 | 
				
			||||||
 | 
					                                <li>Regular expression ‐ example <code>/reports.+?2022/i</code></li>
 | 
				
			||||||
 | 
					                                <li>Use <code>//(?aiLmsux))</code> type flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br/></li>
 | 
				
			||||||
 | 
					                                <li>Keyword example ‐ example <code>Out of stock</code></li>
 | 
				
			||||||
 | 
					                                <li>Use groups to extract just that text ‐ example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
 | 
				
			||||||
 | 
					                            </ul>
 | 
				
			||||||
 | 
					                        </li>
 | 
				
			||||||
 | 
					                        <li>One line per regular-expression/ string match</li>
 | 
				
			||||||
                    </ul>
 | 
					                    </ul>
 | 
				
			||||||
                        </span>
 | 
					                        </span>
 | 
				
			||||||
                    </div>
 | 
					                    </div>
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -15,7 +15,7 @@ def set_original_response():
 | 
				
			|||||||
     </br>
 | 
					     </br>
 | 
				
			||||||
     So let's see what happens.  </br>
 | 
					     So let's see what happens.  </br>
 | 
				
			||||||
     <div id="sametext">Some text thats the same</div>
 | 
					     <div id="sametext">Some text thats the same</div>
 | 
				
			||||||
     <div id="changetext">Some text that will change</div>
 | 
					     <div class="changetext">Some text that will change</div>     
 | 
				
			||||||
     </body>
 | 
					     </body>
 | 
				
			||||||
     </html>
 | 
					     </html>
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
@@ -33,7 +33,8 @@ def set_modified_response():
 | 
				
			|||||||
     </br>
 | 
					     </br>
 | 
				
			||||||
     So let's see what happens.  </br>
 | 
					     So let's see what happens.  </br>
 | 
				
			||||||
     <div id="sametext">Some text thats the same</div>
 | 
					     <div id="sametext">Some text thats the same</div>
 | 
				
			||||||
     <div id="changetext">Some text that did change ( 1000 online <br/> 80 guests<br/>  2000 online )</div>
 | 
					     <div class="changetext">Some text that did change ( 1000 online <br/> 80 guests<br/>  2000 online )</div>
 | 
				
			||||||
 | 
					     <div class="changetext">SomeCase insensitive 3456</div>
 | 
				
			||||||
     </body>
 | 
					     </body>
 | 
				
			||||||
     </html>
 | 
					     </html>
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
@@ -44,11 +45,78 @@ def set_modified_response():
 | 
				
			|||||||
    return None
 | 
					    return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_check_filter_and_regex_extract(client, live_server):
 | 
					def set_multiline_response():
 | 
				
			||||||
    sleep_time_for_fetch_thread = 3
 | 
					    test_return_data = """<html>
 | 
				
			||||||
 | 
					       <body>
 | 
				
			||||||
 | 
					     
 | 
				
			||||||
 | 
					     <p>Something <br/>
 | 
				
			||||||
 | 
					        across 6 billion multiple<br/>
 | 
				
			||||||
 | 
					        lines
 | 
				
			||||||
 | 
					     </p>
 | 
				
			||||||
 | 
					     
 | 
				
			||||||
 | 
					     <div>aaand something lines</div>
 | 
				
			||||||
 | 
					     </body>
 | 
				
			||||||
 | 
					     </html>
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    with open("test-datastore/endpoint-content.txt", "w") as f:
 | 
				
			||||||
 | 
					        f.write(test_return_data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_setup(client, live_server):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    live_server_setup(live_server)
 | 
					    live_server_setup(live_server)
 | 
				
			||||||
    css_filter = "#changetext"
 | 
					
 | 
				
			||||||
 | 
					def test_check_filter_multiline(client, live_server):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    set_multiline_response()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Add our URL to the import page
 | 
				
			||||||
 | 
					    test_url = url_for('test_endpoint', _external=True)
 | 
				
			||||||
 | 
					    res = client.post(
 | 
				
			||||||
 | 
					        url_for("import_page"),
 | 
				
			||||||
 | 
					        data={"urls": test_url},
 | 
				
			||||||
 | 
					        follow_redirects=True
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    assert b"1 Imported" in res.data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    time.sleep(3)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Goto the edit page, add our ignore text
 | 
				
			||||||
 | 
					    # Add our URL to the import page
 | 
				
			||||||
 | 
					    res = client.post(
 | 
				
			||||||
 | 
					        url_for("edit_page", uuid="first"),
 | 
				
			||||||
 | 
					        data={"css_filter": '',
 | 
				
			||||||
 | 
					              'extract_text': '/something.+?6 billion.+?lines/si',
 | 
				
			||||||
 | 
					              "url": test_url,
 | 
				
			||||||
 | 
					              "tag": "",
 | 
				
			||||||
 | 
					              "headers": "",
 | 
				
			||||||
 | 
					              'fetch_backend': "html_requests"
 | 
				
			||||||
 | 
					              },
 | 
				
			||||||
 | 
					        follow_redirects=True
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert b"Updated watch." in res.data
 | 
				
			||||||
 | 
					    time.sleep(3)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    res = client.get(
 | 
				
			||||||
 | 
					        url_for("preview_page", uuid="first"),
 | 
				
			||||||
 | 
					        follow_redirects=True
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert b'<div class="">Something' in res.data
 | 
				
			||||||
 | 
					    assert b'<div class="">across 6 billion multiple' in res.data
 | 
				
			||||||
 | 
					    assert b'<div class="">lines' in res.data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # but the last one, which also says 'lines' shouldnt be here (non-greedy match checking)
 | 
				
			||||||
 | 
					    assert b'aaand something lines' not in res.data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_check_filter_and_regex_extract(client, live_server):
 | 
				
			||||||
 | 
					    sleep_time_for_fetch_thread = 3
 | 
				
			||||||
 | 
					    css_filter = ".changetext"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    set_original_response()
 | 
					    set_original_response()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -64,6 +132,7 @@ def test_check_filter_and_regex_extract(client, live_server):
 | 
				
			|||||||
    )
 | 
					    )
 | 
				
			||||||
    assert b"1 Imported" in res.data
 | 
					    assert b"1 Imported" in res.data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    time.sleep(1)
 | 
				
			||||||
    # Trigger a check
 | 
					    # Trigger a check
 | 
				
			||||||
    client.get(url_for("form_watch_checknow"), follow_redirects=True)
 | 
					    client.get(url_for("form_watch_checknow"), follow_redirects=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -75,7 +144,7 @@ def test_check_filter_and_regex_extract(client, live_server):
 | 
				
			|||||||
    res = client.post(
 | 
					    res = client.post(
 | 
				
			||||||
        url_for("edit_page", uuid="first"),
 | 
					        url_for("edit_page", uuid="first"),
 | 
				
			||||||
        data={"css_filter": css_filter,
 | 
					        data={"css_filter": css_filter,
 | 
				
			||||||
              'extract_text': '\d+ online\n\d+ guests',
 | 
					              'extract_text': '\d+ online\r\n\d+ guests\r\n/somecase insensitive \d+/i\r\n/somecase insensitive (345\d)/i',
 | 
				
			||||||
              "url": test_url,
 | 
					              "url": test_url,
 | 
				
			||||||
              "tag": "",
 | 
					              "tag": "",
 | 
				
			||||||
              "headers": "",
 | 
					              "headers": "",
 | 
				
			||||||
@@ -86,15 +155,6 @@ def test_check_filter_and_regex_extract(client, live_server):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    assert b"Updated watch." in res.data
 | 
					    assert b"Updated watch." in res.data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Check it saved
 | 
					 | 
				
			||||||
    res = client.get(
 | 
					 | 
				
			||||||
        url_for("edit_page", uuid="first"),
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    assert b'\d+ online' in res.data
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # Trigger a check
 | 
					 | 
				
			||||||
#    client.get(url_for("form_watch_checknow"), follow_redirects=True)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # Give the thread time to pick it up
 | 
					    # Give the thread time to pick it up
 | 
				
			||||||
    time.sleep(sleep_time_for_fetch_thread)
 | 
					    time.sleep(sleep_time_for_fetch_thread)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -126,5 +186,13 @@ def test_check_filter_and_regex_extract(client, live_server):
 | 
				
			|||||||
    # Both regexs should be here
 | 
					    # Both regexs should be here
 | 
				
			||||||
    assert b'<div class="">80 guests' in res.data
 | 
					    assert b'<div class="">80 guests' in res.data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Regex with flag handling should be here
 | 
				
			||||||
 | 
					    assert b'<div class="">SomeCase insensitive 3456' in res.data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Singular group from /somecase insensitive (345\d)/i
 | 
				
			||||||
 | 
					    assert b'<div class="">3456' in res.data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Regex with multiline flag handling should be here
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Should not be here
 | 
					    # Should not be here
 | 
				
			||||||
    assert b'Some text that did change' not in res.data
 | 
					    assert b'Some text that did change' not in res.data
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -113,7 +113,6 @@ class update_worker(threading.Thread):
 | 
				
			|||||||
                        err_text = "Page request from server didnt respond correctly"
 | 
					                        err_text = "Page request from server didnt respond correctly"
 | 
				
			||||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
 | 
					                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
 | 
				
			||||||
                                                                           'last_check_status': e.status_code})
 | 
					                                                                           'last_check_status': e.status_code})
 | 
				
			||||||
 | 
					 | 
				
			||||||
                    except Exception as e:
 | 
					                    except Exception as e:
 | 
				
			||||||
                        self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
 | 
					                        self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
 | 
				
			||||||
                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
 | 
					                        self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user