Enhance cleanup of fetched texts.

Remove all HTML elements with a class "read-more" or a class matching
"read-more-*". This strips the "Read More" text from the cleaned-up output.
This commit is contained in:
Hartmut Goebel
2020-09-25 19:18:47 +02:00
parent 5945a9f9cb
commit 60d74188c3

View File

@@ -175,7 +175,11 @@ def collect_images(entry, generator=None):
def get_entry(entry, include_images, generator=None):
def cleanup(text):
text = BeautifulSoup(text, 'html.parser').get_text()
html = BeautifulSoup(text, 'html.parser')
# Remove all elements of class read-more or read-more-*
for more in html.find_all(None, re.compile("^read-more($|-.*)")):
more.extract()
text = html.get_text()
text = re.sub('\xa0+', ' ', text)
text = re.sub(' +', ' ', text)
text = re.sub(' +\n', '\n', text)