From 60d74188c3f2a8b88f8358ca7c261fb65cf5beb7 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Fri, 25 Sep 2020 19:18:47 +0200 Subject: [PATCH] Enhance cleanup of fetched texts. Remove all HTML elements with the class "read-more" or a class matching "read-more-*". This removes the "Read More" text from the cleaned-up output. --- feediverse.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/feediverse.py b/feediverse.py index 2b94e03..1c43d33 100755 --- a/feediverse.py +++ b/feediverse.py @@ -175,7 +175,11 @@ def collect_images(entry, generator=None): def get_entry(entry, include_images, generator=None): def cleanup(text): - text = BeautifulSoup(text, 'html.parser').get_text() + html = BeautifulSoup(text, 'html.parser') + # Remove all elements of class read-more or read-more-* + for more in html.find_all(None, re.compile("^read-more($|-.*)")): + more.extract() + text = html.get_text() text = re.sub('\xa0+', ' ', text) text = re.sub(' +', ' ', text) text = re.sub(' +\n', '\n', text)