CSS selector filter (#73)

* Re #9 CSS Selector filtering, adding test for #9
This commit is contained in:
dgtlmoon
2021-05-30 21:22:26 +10:00
committed by GitHub
parent 1a0c3f1250
commit 2346b42ef2
6 changed files with 144 additions and 7 deletions

View File

@@ -66,25 +66,36 @@ class perform_site_check():
timeout=timeout,
verify=False)
stripped_text_from_html = get_text(r.text)
# CSS Filter
css_filter = self.datastore.data['watching'][uuid]['css_filter']
if css_filter and len(css_filter.strip()):
from bs4 import BeautifulSoup
soup = BeautifulSoup(r.content, "html.parser")
stripped_text_from_html = ""
for item in soup.select(css_filter):
text = str(item.get_text())+"\n"
stripped_text_from_html += text
else:
stripped_text_from_html = get_text(r.text)
# Usually from networkIO/requests level
except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
update_obj["last_error"] = str(e)
print(str(e))
except requests.exceptions.MissingSchema:
print("Skipping {} due to missing schema/bad url".format(uuid))
# Usually from html2text level
except UnicodeDecodeError as e:
except Exception as e:
# except UnicodeDecodeError as e:
update_obj["last_error"] = str(e)
print(str(e))
# figure out how to deal with this cleaner..
# 'utf-8' codec can't decode byte 0xe9 in position 480: invalid continuation byte
else:
# We rely on the actual text in the HTML output. Many sites have random script vars etc.;
# in the future we'll implement other mechanisms.