mirror of
https://github.com/dgtlmoon/changedetection.io.git
synced 2025-10-31 22:57:18 +00:00
Compare commits
2 Commits
reset-prev
...
memory-lea
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
59fa8db18f | ||
|
|
09685c62ab |
@@ -4,8 +4,6 @@ from typing import List
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from jsonpath_ng.ext import parse
|
from jsonpath_ng.ext import parse
|
||||||
import re
|
import re
|
||||||
from inscriptis import get_text
|
|
||||||
from inscriptis.model.config import ParserConfig
|
|
||||||
|
|
||||||
class FilterNotFoundInResponse(ValueError):
|
class FilterNotFoundInResponse(ValueError):
|
||||||
def __init__(self, msg):
|
def __init__(self, msg):
|
||||||
@@ -190,9 +188,16 @@ def strip_ignore_text(content, wordlist, mode="content"):
|
|||||||
|
|
||||||
|
|
||||||
def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
||||||
|
import multiprocessing
|
||||||
|
|
||||||
|
from inscriptis.model.config import ParserConfig
|
||||||
|
|
||||||
"""Converts html string to a string with just the text. If ignoring
|
"""Converts html string to a string with just the text. If ignoring
|
||||||
rendering anchor tag content is enable, anchor tag content are also
|
rendering anchor tag content is enable, anchor tag content are also
|
||||||
included in the text
|
included in the text
|
||||||
|
|
||||||
|
@NOTE: HORRIBLE LXML INDUCED MEMORY LEAK WORKAROUND HERE
|
||||||
|
https://www.reddit.com/r/Python/comments/j0gl8t/psa_pythonlxml_memory_leaks_and_a_solution/
|
||||||
|
|
||||||
:param html_content: string with html content
|
:param html_content: string with html content
|
||||||
:param render_anchor_tag_content: boolean flag indicating whether to extract
|
:param render_anchor_tag_content: boolean flag indicating whether to extract
|
||||||
@@ -214,8 +219,19 @@ def html_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
|||||||
else:
|
else:
|
||||||
parser_config = None
|
parser_config = None
|
||||||
|
|
||||||
# get text and annotations via inscriptis
|
|
||||||
text_content = get_text(html_content, config=parser_config)
|
def parse_function(html_content, parser_config, results_queue):
|
||||||
|
from inscriptis import get_text
|
||||||
|
# get text and annotations via inscriptis
|
||||||
|
text_content = get_text(html_content, config=parser_config)
|
||||||
|
results_queue.put(text_content)
|
||||||
|
|
||||||
|
results_queue = multiprocessing.Queue()
|
||||||
|
parse_process = multiprocessing.Process(target=parse_function, args=(html_content, parser_config, results_queue))
|
||||||
|
parse_process.daemon = True
|
||||||
|
parse_process.start()
|
||||||
|
text_content = results_queue.get() # blocks until results are available
|
||||||
|
parse_process.terminate()
|
||||||
|
|
||||||
return text_content
|
return text_content
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user