Remove HTML tags from content.

Do this as early as possible, while processing the entry, so that later steps can rely on it (especially when counting characters). Also add a new requirement: beautifulsoup4.
2019-03-29 22:54:30 +01:00
parent fc56be6d70
commit 8886fd5d2d
2 changed files with 10 additions and 3 deletions
--- a/feediverse.py
+++ b/feediverse.py
@@ -6,10 +6,12 @@ import argparse
 import yaml
 import dateutil
 import feedparser
+from bs4 import BeautifulSoup

 from mastodon import Mastodon
 from datetime import datetime, timezone

+
 DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")

 def main():
@@ -74,10 +76,11 @@ def get_entry(entry):
    for tag in entry.get('tags', []):
        for t in tag['term'].split(' '):
            hashtags.append('#{}'.format(t))
+    summary = entry.get('summary', '')
    return {
        'url': entry.id,
-        'title': entry.title,
-        'summary': entry.get('summary', ''),
+        'title': BeautifulSoup(entry.title, 'html.parser').get_text(),
+        'summary': BeautifulSoup(summary, 'html.parser').get_text(),
        'hashtags': ' '.join(hashtags),
        'updated': dateutil.parser.parse(entry['updated']),
    }
--- a/setup.py
+++ b/setup.py
@@ -14,6 +14,10 @@ setup(
    description='Connect an RSS Feed to Mastodon',
    long_description=long_description,
    long_description_content_type="text/markdown",
-    install_requires=['feedparser', 'mastodon.py', 'python-dateutil', 'pyyaml'],
+    install_requires=['beautifulsoup4',
+                      'feedparser',
+                      'mastodon.py',
+                      'python-dateutil',
+                      'pyyaml'],
    entry_points={'console_scripts': ['feediverse = feediverse:main']}
 )