From 60d74188c3f2a8b88f8358ca7c261fb65cf5beb7 Mon Sep 17 00:00:00 2001
From: Hartmut Goebel <h.goebel@crazy-compilers.com>
Date: Fri, 25 Sep 2020 19:18:47 +0200
Subject: [PATCH] Enhance cleanup of fetched texts.

Remove all HTML elements with a class "read-more" or a class matching
"read-more-*". This strips the "Read More" text from the cleaned-up result.
---
 feediverse.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/feediverse.py b/feediverse.py
index 2b94e03..1c43d33 100755
--- a/feediverse.py
+++ b/feediverse.py
@@ -175,7 +175,11 @@ def collect_images(entry, generator=None):
 def get_entry(entry, include_images, generator=None):
 
     def cleanup(text):
-        text = BeautifulSoup(text, 'html.parser').get_text()
+        html = BeautifulSoup(text, 'html.parser')
+        # Remove all elements of class read-more or read-more-*
+        for more in html.find_all(None, re.compile("^read-more($|-.*)")):
+            more.extract()
+        text = html.get_text()
         text = re.sub('\xa0+', ' ', text)
         text = re.sub('  +', ' ', text)
         text = re.sub(' +\n', '\n', text)