Fix flag handling

2026-07-08 08:21:14 +00:00 · 2022-07-26 16:43:29 +02:00
parent ced1c66e4d
commit 2e451e1f8a
2 changed files with 35 additions and 11 deletions
@@ -11,6 +11,7 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


 # Some common stuff here that can be moved to a base class
+# (set_proxy_from_list)
 class perform_site_check():

    def __init__(self, *args, datastore, **kwargs):
@@ -45,6 +46,20 @@ class perform_site_check():

        return proxy_args

+    # Python's re module has no /pattern/flags literal, so '/foo/i' is rewritten to '(?i)foo'.
+    # Global inline flags must lead the pattern: trailing ones are rejected since Python 3.11.
+    # Text without the /.../flags enclosure is kept whole and made case-insensitive.
+    def forward_slash_enclosed_regex_to_options(self, regex):
+        # group(1) is the pattern between the slashes, group(2) the flag letters after them
+        res = re.search(r'^/(.*?)/(\w+)$', regex, re.IGNORECASE)
+        if res:
+            regex = '(?{}){}'.format(res.group(2), res.group(1))
+        else:
+            regex = '(?{}){}'.format('i', regex)
+
+        return regex
+
+
    def run(self, uuid):
        timestamp = int(time.time())  # used for storage etc too

@@ -215,15 +230,17 @@ class perform_site_check():
        if len(extract_text) > 0:
            regex_matched_output = []
            for s_re in extract_text:
-                result = re.findall(s_re.encode('utf8'), stripped_text_from_html, flags=re.DOTALL)
-                if result:
-                    for l in result:
-                        if type(l) is tuple:
-                            #@todo - some formatter option default (between groups)
-                            regex_matched_output += list(l) + [b'\n']
-                        else:
-                            # @todo - some formatter option default (between each ungrouped result)
-                            regex_matched_output += [l] + [b'\n']
+                # In case they specified something in '/.../x'
+                regex = self.forward_slash_enclosed_regex_to_options(s_re)
+                result = re.findall(regex.encode('utf-8'), stripped_text_from_html)
+
+                for l in result:
+                    if type(l) is tuple:
+                        #@todo - some formatter option default (between groups)
+                        regex_matched_output += list(l) + [b'\n']
+                    else:
+                        # @todo - some formatter option default (between each ungrouped result)
+                        regex_matched_output += [l] + [b'\n']

            # Now we will only show what the regex matched
            stripped_text_from_html = b''
@@ -239,8 +239,15 @@ Unavailable") }}
                        {{ render_field(form.extract_text, rows=5, placeholder="\d+ online") }}
                        <span class="pure-form-message-inline">
                    <ul>
-                        <li>Extracts text in the final output after other filters using regular expressions, for example <code>\d+ online</code></li>
-                        <li>One line per regular-expression.</li>
+                        <li>Extracts text in the final output (line by line) after other filters using regular expressions;
+                            <ul>
+                                <li>Regular expression &dash; example <code>/reports.+?2022/i</code></li>
+                                <li>Use <code>/.../aiLmsux</code> type flags after the closing slash; they are applied as Python <code>(?aiLmsux)</code> inline flags (more <a href="https://docs.python.org/3/library/re.html#index-15">information here</a>)<br/></li>
+                                <li>Keyword &dash; example <code>Out of stock</code></li>
+                                <li>Use groups to extract just that text &dash; example <code>/reports.+?(\d+)/i</code> returns a list of years only</li>
+                            </ul>
+                        </li>
+                        <li>One line per regular expression or string match</li>
                    </ul>
                        </span>
                    </div>