From 493c1ad3f3859f5a9e594205dd9596cd2abc6c9e Mon Sep 17 00:00:00 2001
From: Hartmut Goebel
Date: Fri, 29 Mar 2019 20:58:19 +0100
Subject: [PATCH 01/31] Add minimal argparse support to get support for `--help`.

Closes #5.
---
 feediverse.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/feediverse.py b/feediverse.py
index 5a51207..b9645b9 100755
--- a/feediverse.py
+++ b/feediverse.py
@@ -2,6 +2,7 @@
 
 import os
 import sys
+import argparse
 import yaml
 import dateutil
 import feedparser
@@ -9,8 +10,17 @@ import feedparser
 from mastodon import Mastodon
 from datetime import datetime, timezone
 
+DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
+
 def main():
-    config_file = get_config_file()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE",
+                        help=("config file to use, default: %s" %
+                              DEFAULT_CONFIG_FILE),
+                        default=os.path.expanduser(DEFAULT_CONFIG_FILE))
+    args = parser.parse_args()
+    config_file = args.config_file
+
     if not os.path.isfile(config_file):
         setup(config_file)
 
@@ -29,13 +39,6 @@ def main():
 
     save_config(config, config_file)
 
-def get_config_file():
-    if __name__ == "__main__" and len(sys.argv) > 1:
-        config_file = sys.argv[1]
-    else:
-        config_file = os.path.join(os.path.expanduser("~"), ".feediverse")
-    return config_file
-
 def save_config(config, config_file):
     copy = dict(config)
     copy['updated'] = datetime.now(tz=timezone.utc).isoformat()

From e99c18b249f8a73de0e8da75c3abe6a0c45ef29b Mon Sep 17 00:00:00 2001
From: Hartmut Goebel
Date: Fri, 29 Mar 2019 21:33:07 +0100
Subject: [PATCH 02/31] Sort entries in reverse published order.

In a feed the newest entries are typically on top, while the older ones
should be posted first. Thus reverse the order, based on the publish date.

Closes #4.
---
 feediverse.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/feediverse.py b/feediverse.py
index b9645b9..eeb3c1b 100755
--- a/feediverse.py
+++ b/feediverse.py
@@ -58,6 +58,7 @@ def read_config(config_file):
 def get_feed(feed_url, last_update):
     new_entries = 0
     feed = feedparser.parse(feed_url)
+    feed.entries.sort(key=lambda e: e.published_parsed)
     for entry in feed.entries:
         e = get_entry(entry)
         if last_update is None or e['updated'] > last_update:

From fc56be6d70b9de99a970ab6cac276aa52b8ea62b Mon Sep 17 00:00:00 2001
From: Hartmut Goebel
Date: Fri, 29 Mar 2019 22:33:23 +0100
Subject: [PATCH 03/31] Filter entries prior to processing any entry.

This saves processing time, especially since for most installations
there should not be many changes and, most of the time, there will be
zero entries to be posted, so there is no need to process them.
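A rough, runnable sketch of the sorting (PATCH 02) and filtering (PATCH 03)
steps; the entries are invented, and plain tuples stand in for the
time.struct_time values feedparser returns:

    import dateutil.parser

    entries = [
        {"published_parsed": (2019, 3, 29, 20, 0, 0, 4, 88, 0),
         "updated": "2019-03-29T20:00:00+00:00"},
        {"published_parsed": (2019, 3, 28, 9, 30, 0, 3, 87, 0),
         "updated": "2019-03-28T09:30:00+00:00"},
    ]

    # published_parsed values compare element by element, so an ascending sort
    # yields oldest-first -- the order in which the entries should be tooted.
    entries.sort(key=lambda e: e["published_parsed"])

    # Filter before processing: only entries newer than the last run are kept.
    last_update = dateutil.parser.parse("2019-03-29T00:00:00+00:00")
    new_entries = [e for e in entries
                   if dateutil.parser.parse(e["updated"]) > last_update]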
--- feediverse.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/feediverse.py b/feediverse.py index eeb3c1b..3226a4f 100755 --- a/feediverse.py +++ b/feediverse.py @@ -58,12 +58,15 @@ def read_config(config_file): def get_feed(feed_url, last_update): new_entries = 0 feed = feedparser.parse(feed_url) - feed.entries.sort(key=lambda e: e.published_parsed) - for entry in feed.entries: - e = get_entry(entry) - if last_update is None or e['updated'] > last_update: - new_entries += 1 - yield e + if last_update: + entries = [e for e in feed.entries + if dateutil.parser.parse(e['updated']) > last_update] + else: + entries = feed.entries + entries.sort(key=lambda e: e.published_parsed) + for entry in entries: + new_entries += 1 + yield get_entry(entry) return new_entries def get_entry(entry): From 8886fd5d2dc56e44397e3c2fcf1dd8326e0f555c Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Fri, 29 Mar 2019 22:54:30 +0100 Subject: [PATCH 04/31] Remove HTML tags from content. Do this as early as processing the entry so later steps can count on it (esp. when counting characters) Also add a new requirement: beautifulsoup4. --- feediverse.py | 7 +++++-- setup.py | 6 +++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/feediverse.py b/feediverse.py index 3226a4f..0d225e6 100755 --- a/feediverse.py +++ b/feediverse.py @@ -6,10 +6,12 @@ import argparse import yaml import dateutil import feedparser +from bs4 import BeautifulSoup from mastodon import Mastodon from datetime import datetime, timezone + DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") def main(): @@ -74,10 +76,11 @@ def get_entry(entry): for tag in entry.get('tags', []): for t in tag['term'].split(' '): hashtags.append('#{}'.format(t)) + summary = entry.get('summary', '') return { 'url': entry.id, - 'title': entry.title, - 'summary': entry.get('summary', ''), + 'title': BeautifulSoup(entry.title, 'html.parser').get_text(), + 'summary': BeautifulSoup(summary, 'html.parser').get_text(), 'hashtags': ' '.join(hashtags), 'updated': dateutil.parser.parse(entry['updated']), } diff --git a/setup.py b/setup.py index 0f0a766..9cc56a9 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,10 @@ setup( description='Connect an RSS Feed to Mastodon', long_description=long_description, long_description_content_type="text/markdown", - install_requires=['feedparser', 'mastodon.py', 'python-dateutil', 'pyyaml'], + install_requires=['beautifulsoup4', + 'feedparser', + 'mastodon.py', + 'python-dateutil', + 'pyyaml'], entry_points={'console_scripts': ['feediverse = feediverse:main']} ) From 0b13bbbabea03e640eb2ac31292d2a87c34d40ec Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Fri, 29 Mar 2019 23:03:31 +0100 Subject: [PATCH 05/31] Very small code cleanup. --- feediverse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/feediverse.py b/feediverse.py index 0d225e6..1be532a 100755 --- a/feediverse.py +++ b/feediverse.py @@ -74,8 +74,8 @@ def get_feed(feed_url, last_update): def get_entry(entry): hashtags = [] for tag in entry.get('tags', []): - for t in tag['term'].split(' '): - hashtags.append('#{}'.format(t)) + for t in tag['term'].split(): + hashtags.append('#' + t) summary = entry.get('summary', '') return { 'url': entry.id, From 13d1dd2623dbca56a8147949dbfb2267566f70c0 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Sat, 30 Mar 2019 19:51:00 +0100 Subject: [PATCH 06/31] Fix deprecation warning when calling yaml.load(). 
This was the message: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe --- feediverse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feediverse.py b/feediverse.py index 1be532a..a18b7bd 100755 --- a/feediverse.py +++ b/feediverse.py @@ -50,7 +50,7 @@ def save_config(config, config_file): def read_config(config_file): config = {} with open(config_file) as fh: - config = yaml.load(fh) + config = yaml.load(fh, yaml.SafeLoader) if 'updated' in config: config['updated'] = dateutil.parser.parse(config['updated']) else: From 83ed532680f9b33890c9d1c1741bdc9a64fa7753 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Sat, 30 Mar 2019 21:17:24 +0100 Subject: [PATCH 07/31] Add retrieving images from RSS & posting them. Collects image urls from summary, content and enclosures (attachments). This add urllib3 as requirement. --- feediverse.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- setup.py | 3 ++- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/feediverse.py b/feediverse.py index a18b7bd..54a9f8f 100755 --- a/feediverse.py +++ b/feediverse.py @@ -10,10 +10,13 @@ from bs4 import BeautifulSoup from mastodon import Mastodon from datetime import datetime, timezone +import urllib3 DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") +http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',) + def main(): parser = argparse.ArgumentParser() parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE", @@ -37,8 +40,15 @@ def main(): for feed in config['feeds']: for entry in get_feed(feed['url'], config['updated']): - masto.status_post(feed['template'].format(**entry)[0:49999999999]) - + media_ids = [] + for img in entry.get("images", []): + media = masto.media_post(img, img.headers['content-type']) + img.release_conn() # deferred from collect_images() + if not 'error' in media: + media_ids.append(media) + entry.pop("images", None) + masto.status_post(feed['template'].format(**entry)[:49999999999], + media_ids=media_ids) save_config(config, config_file) def save_config(config, config_file): @@ -71,6 +81,40 @@ def get_feed(feed_url, last_update): yield get_entry(entry) return new_entries +def collect_images(entry): + + def find_urls(part): + if not part: + return + soup = BeautifulSoup(part, 'html.parser') + for tag in soup.find_all(["a", "img"]): + if tag.name == "a": + url = tag["href"] + elif tag.name == "img": + url = tag["src"] + if url not in urls: + urls.append(url) + + urls = [] + find_urls(entry.get("summary", "")) + for c in entry.get("content", []): + find_urls(c.value) + for e in (entry.enclosures + + [l for l in entry.links if l.get("rel") == "enclosure"]): + if (e["type"].startswith(("image/", "video/")) and + e["href"] not in urls): + urls.append(e["href"]) + images = [] + for url in urls: + resp = http.request('GET', url, preload_content=False) + if resp.headers['content-type'].startswith(("image/", "video/")): + images.append(resp) + # IMPORTANT: Need to release_conn() later! 
+        else:
+            resp.release_conn()
+    return images
+
+
 def get_entry(entry):
     hashtags = []
     for tag in entry.get('tags', []):
@@ -83,6 +127,7 @@ def get_entry(entry):
         'summary': BeautifulSoup(summary, 'html.parser').get_text(),
         'hashtags': ' '.join(hashtags),
         'updated': dateutil.parser.parse(entry['updated']),
+        'images': collect_images(entry),
     }
 
 def setup(config_file):
diff --git a/setup.py b/setup.py
index 9cc56a9..3c1bb2a 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,7 @@ setup(
                       'feedparser',
                       'mastodon.py',
                       'python-dateutil',
-                      'pyyaml'],
+                      'pyyaml',
+                      'urllib3[secure]'],
     entry_points={'console_scripts': ['feediverse = feediverse:main']}
 )

From 350f2bca3f525d81dc849fe4c3244b1e53950db2 Mon Sep 17 00:00:00 2001
From: Hartmut Goebel
Date: Sat, 30 Mar 2019 22:01:28 +0100
Subject: [PATCH 08/31] Add detection of feed generator and pass it to get_entry().

This allows generator-specific handling of, e.g., the URL. For example,
in Wordpress `id` is an ugly URL, while the human-readable permalink is
stored in an alternate link.
---
 feediverse.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/feediverse.py b/feediverse.py
index 54a9f8f..1e6afac 100755
--- a/feediverse.py
+++ b/feediverse.py
@@ -67,6 +67,14 @@ def read_config(config_file):
         config['updated'] = datetime.now(tz=timezone.utc)
     return config
 
+def detect_generator(feed):
+    # For RSS the generator tag holds the URL, while for ATOM it holds the name
+    if "/wordpress.org/" in feed.feed.generator:
+        return "wordpress"
+    elif "wordpress" == feed.feed.generator.lower():
+        return "wordpress"
+    return None
+
 def get_feed(feed_url, last_update):
     new_entries = 0
     feed = feedparser.parse(feed_url)
@@ -76,9 +84,10 @@ def get_feed(feed_url, last_update):
     else:
         entries = feed.entries
     entries.sort(key=lambda e: e.published_parsed)
+    generator = detect_generator(feed)
     for entry in entries:
         new_entries += 1
-        yield get_entry(entry)
+        yield get_entry(entry, generator)
     return new_entries
 
 def collect_images(entry):

From 03d48992c77f646c0027c89ea57b77783708e1dc Mon Sep 17 00:00:00 2001
From: Hartmut Goebel
Date: Sat, 30 Mar 2019 22:02:48 +0100
Subject: [PATCH 09/31] Add detection of permalink for wordpress-generated feeds.

---
 feediverse.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/feediverse.py b/feediverse.py
index 1e6afac..5d9bb79 100755
--- a/feediverse.py
+++ b/feediverse.py
@@ -130,8 +130,15 @@ def get_entry(entry, generator=None):
         for t in tag['term'].split():
             hashtags.append('#' + t)
     summary = entry.get('summary', '')
+    url = entry.id
+    if generator == "wordpress":
+        links = [l for l in entry.links if l.get("rel") == "alternate"]
+        if len(links) > 1:
+            links = [l for l in entry.links if l.get("type") == "text/html"]
+        if links:
+            url = links[0]["href"]
     return {
-        'url': entry.id,
+        'url': url,
         'title': BeautifulSoup(entry.title, 'html.parser').get_text(),
         'summary': BeautifulSoup(summary, 'html.parser').get_text(),
         'hashtags': ' '.join(hashtags),

From ae78c8c16f7ddc98f5c6351d86a06bfb55b7f8a1 Mon Sep 17 00:00:00 2001
From: Hartmut Goebel
Date: Sat, 30 Mar 2019 22:05:50 +0100
Subject: [PATCH 10/31] Make "content" available in the template.

Depending on the feed, adding the content could be more appropriate.
Leave this choice to the user.
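A sketch of how a feed's template is expanded with these entry fields; the
entry values and the template below are invented, not taken from a real feed:

    entry = {
        "title": "Hello World",
        "url": "https://example.org/2019/03/hello-world/",
        "content": "The full post text, with HTML tags already stripped ...",
        "hashtags": "#demo #rss",
    }

    template = "{title} {url} {hashtags}"   # or e.g. "{content} {url}" to toot the body
    status = template.format(**entry)
    # 'Hello World https://example.org/2019/03/hello-world/ #demo #rss'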
--- feediverse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/feediverse.py b/feediverse.py index 5d9bb79..e008d1a 100755 --- a/feediverse.py +++ b/feediverse.py @@ -130,6 +130,7 @@ def get_entry(entry, generator=None): for t in tag['term'].split(): hashtags.append('#' + t) summary = entry.get('summary', '') + content = entry.get('content', '') url = entry.id if generator == "wordpress": links = [l for l in entry.links if l.get("rel") == "alternate"] @@ -141,6 +142,7 @@ def get_entry(entry, generator=None): 'url': url, 'title': BeautifulSoup(entry.title, 'html.parser').get_text(), 'summary': BeautifulSoup(summary, 'html.parser').get_text(), + 'content': BeautifulSoup(summary, 'html.parser').get_text(), 'hashtags': ' '.join(hashtags), 'updated': dateutil.parser.parse(entry['updated']), 'images': collect_images(entry), From d2e57bbc270346c082ee617e8d475fb42b785e76 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Sat, 30 Mar 2019 23:34:55 +0100 Subject: [PATCH 11/31] Add a work-around for buggy wordpress (urls encoded wrong). --- feediverse.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/feediverse.py b/feediverse.py index e008d1a..b6f86bd 100755 --- a/feediverse.py +++ b/feediverse.py @@ -2,6 +2,7 @@ import os import sys +import codecs import argparse import yaml import dateutil @@ -17,6 +18,14 @@ DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',) +# encoding error-handler for buggy wordpress urls +def __urlencodereplace_errors(exc): + bs = exc.object[exc.start:exc.end].encode("utf-8") + bs = b"".join(b'%%%X' % b for b in bs) + return (bs, exc.end) +codecs.register_error("urlencodereplace", __urlencodereplace_errors) + + def main(): parser = argparse.ArgumentParser() parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE", @@ -90,7 +99,7 @@ def get_feed(feed_url, last_update): yield get_entry(entry, generator) return new_entries -def collect_images(entry): +def collect_images(entry, generator=None): def find_urls(part): if not part: @@ -113,6 +122,12 @@ def collect_images(entry): if (e["type"].startswith(("image/", "video/")) and e["href"] not in urls): urls.append(e["href"]) + if generator == "wordpress": + # Work around a wordpress bug: If the filename contains an + # umlaut, this will not be encoded using %-escape, as the + # standard demands. This will break encoding in http.request() + urls = (u.encode("ascii", "urlencodereplace").decode() + for u in urls) images = [] for url in urls: resp = http.request('GET', url, preload_content=False) @@ -145,7 +160,7 @@ def get_entry(entry, generator=None): 'content': BeautifulSoup(summary, 'html.parser').get_text(), 'hashtags': ' '.join(hashtags), 'updated': dateutil.parser.parse(entry['updated']), - 'images': collect_images(entry), + 'images': collect_images(entry, generator), } def setup(config_file): From e6a16dbe55ac0b2a566ab916a7e0a28ce325e47e Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Sat, 30 Mar 2019 23:53:09 +0100 Subject: [PATCH 12/31] For wordpress skip all images provided by a plugin. 
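Both Wordpress workarounds operate on the collected image URLs. A quick,
self-contained demonstration of the %-escape error handler registered in the
previous patch (the image URL below is made up):

    import codecs

    def urlencodereplace_errors(exc):
        # Same idea as in feediverse.py: UTF-8 encode the offending characters
        # and emit them as %XX escapes.
        raw = exc.object[exc.start:exc.end].encode("utf-8")
        return (b"".join(b"%%%X" % byte for byte in raw), exc.end)

    codecs.register_error("urlencodereplace", urlencodereplace_errors)

    url = "https://example.org/wp-content/uploads/döner.jpg"
    print(url.encode("ascii", "urlencodereplace"))
    # b'https://example.org/wp-content/uploads/d%C3%B6ner.jpg'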
--- feediverse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/feediverse.py b/feediverse.py index b6f86bd..a8e4819 100755 --- a/feediverse.py +++ b/feediverse.py @@ -123,6 +123,7 @@ def collect_images(entry, generator=None): e["href"] not in urls): urls.append(e["href"]) if generator == "wordpress": + urls = (u for u in urls if not "/wp-content/plugins/" in u) # Work around a wordpress bug: If the filename contains an # umlaut, this will not be encoded using %-escape, as the # standard demands. This will break encoding in http.request() From 2624eed96b20ca71d3f026f38e87b0db66504942 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Thu, 4 Apr 2019 16:43:22 +0200 Subject: [PATCH 13/31] Fix: If last-updated is not given in config, no feeds are pushed. The bug was: If last-updated was not given in the config, the current date and time was used, inhibiting posting "old" entries. Todo: Add an option to ask whether "old" entries shall be posted on first run. --- feediverse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/feediverse.py b/feediverse.py index a8e4819..fff9c1a 100755 --- a/feediverse.py +++ b/feediverse.py @@ -10,7 +10,7 @@ import feedparser from bs4 import BeautifulSoup from mastodon import Mastodon -from datetime import datetime, timezone +from datetime import datetime, timezone, MINYEAR import urllib3 @@ -73,7 +73,8 @@ def read_config(config_file): if 'updated' in config: config['updated'] = dateutil.parser.parse(config['updated']) else: - config['updated'] = datetime.now(tz=timezone.utc) + config['updated'] = datetime(MINYEAR, 1, 1, + 0, 0, 0, 0, timezone.utc) return config def detect_generator(feed): From da5486d004564520296464cfad14d6f3cbc70fa7 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 16 Apr 2019 10:35:38 +0200 Subject: [PATCH 14/31] Fix: Mastodon allows posting 4 images max. --- feediverse.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/feediverse.py b/feediverse.py index fff9c1a..446f439 100755 --- a/feediverse.py +++ b/feediverse.py @@ -15,6 +15,7 @@ import urllib3 DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") +MAX_IMAGES = 4 # Mastodon allows attaching 4 images max. http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',) @@ -136,6 +137,8 @@ def collect_images(entry, generator=None): if resp.headers['content-type'].startswith(("image/", "video/")): images.append(resp) # IMPORTANT: Need to release_conn() later! + if len(images) >= MAX_IMAGES: + break else: resp.release_conn() return images From b0ba30b5f375954a9ee457abdb321512a5674271 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 16 Apr 2019 12:31:20 +0200 Subject: [PATCH 15/31] Minor code cleanup. Add and use helper function to ask a yes/no question. --- feediverse.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/feediverse.py b/feediverse.py index 446f439..3f0a409 100755 --- a/feediverse.py +++ b/feediverse.py @@ -169,9 +169,14 @@ def get_entry(entry, generator=None): } def setup(config_file): + + def yes_no(question): + res = input(question + ' [y/n] ') + return res.lower() in "y1" + url = input('What is your Mastodon Instance URL? ') - have_app = input('Do you have your app credentials already? 
[y/n] ') - if have_app.lower() == 'y': + have_app = yes_no('Do you have your app credentials already?') + if have_app: name = 'feediverse' client_id = input('What is your app\'s client id: ') client_secret = input('What is your client secret: ') From e0dde90b7da98999e355b00266bde8b75f01d9b6 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 16 Apr 2019 12:38:43 +0200 Subject: [PATCH 16/31] On setup ask whether existing entries shall be tooted, too. --- feediverse.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/feediverse.py b/feediverse.py index 3f0a409..72a0a32 100755 --- a/feediverse.py +++ b/feediverse.py @@ -61,9 +61,10 @@ def main(): media_ids=media_ids) save_config(config, config_file) -def save_config(config, config_file): +def save_config(config, config_file, toot_old_posts=False): copy = dict(config) - copy['updated'] = datetime.now(tz=timezone.utc).isoformat() + if not toot_old_posts: + copy['updated'] = datetime.now(tz=timezone.utc).isoformat() with open(config_file, 'w') as fh: fh.write(yaml.dump(copy, default_flow_style=False)) @@ -196,6 +197,7 @@ def setup(config_file): access_token = m.log_in(username, password) feed_url = input('RSS/Atom feed URL to watch: ') + old_posts = yes_no('Shall already existing entries be tooted, too?') config = { 'name': name, 'url': url, @@ -206,7 +208,7 @@ def setup(config_file): {'url': feed_url, 'template': '{title} {url}'} ] } - save_config(config, config_file) + save_config(config, config_file, old_posts) print("") print("Your feediverse configuration has been saved to {}".format(config_file)) print("Add a line line this to your crontab to check every 15 minutes:") From 2d45df57f11f7f88f60b7bb5ab631e21ec9fcc1a Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 16 Apr 2019 13:53:00 +0200 Subject: [PATCH 17/31] Minor code cleanup. Preset config values when reading config file. This is to ease introducing new options (like the next commit will do). --- feediverse.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/feediverse.py b/feediverse.py index 72a0a32..47040a8 100755 --- a/feediverse.py +++ b/feediverse.py @@ -69,14 +69,14 @@ def save_config(config, config_file, toot_old_posts=False): fh.write(yaml.dump(copy, default_flow_style=False)) def read_config(config_file): - config = {} + config = { + 'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc), + } with open(config_file) as fh: - config = yaml.load(fh, yaml.SafeLoader) - if 'updated' in config: - config['updated'] = dateutil.parser.parse(config['updated']) - else: - config['updated'] = datetime(MINYEAR, 1, 1, - 0, 0, 0, 0, timezone.utc) + cfg = yaml.load(fh, yaml.SafeLoader) + if 'updated' in cfg: + cfg['updated'] = dateutil.parser.parse(cfg['updated']) + config.update(cfg) return config def detect_generator(feed): From 0b65eb8e21c376f7114b0f62848a9d91c2360cf1 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 16 Apr 2019 13:57:07 +0200 Subject: [PATCH 18/31] Make adding images into the toot configurable. Add an option "include_images" into the config file. 
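This builds on the previous patch: defaults are preset in read_config() and
then overlaid with whatever the config file contains, so a new option needs no
migration. A minimal sketch of that pattern, reduced to the relevant key (the
file name is hypothetical):

    import yaml

    DEFAULTS = {"include_images": False}

    def read_config(path):
        config = dict(DEFAULTS)
        with open(path) as fh:
            config.update(yaml.safe_load(fh) or {})
        return config

    # A config file that never mentions include_images keeps the default False,
    # so existing installations keep their current behaviour.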
--- feediverse.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/feediverse.py b/feediverse.py index 47040a8..4f83683 100755 --- a/feediverse.py +++ b/feediverse.py @@ -49,7 +49,8 @@ def main(): ) for feed in config['feeds']: - for entry in get_feed(feed['url'], config['updated']): + for entry in get_feed(feed['url'], config['updated'], + config['include_images']): media_ids = [] for img in entry.get("images", []): media = masto.media_post(img, img.headers['content-type']) @@ -71,6 +72,7 @@ def save_config(config, config_file, toot_old_posts=False): def read_config(config_file): config = { 'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc), + 'include_images': False, } with open(config_file) as fh: cfg = yaml.load(fh, yaml.SafeLoader) @@ -87,7 +89,7 @@ def detect_generator(feed): return "wordpress" return None -def get_feed(feed_url, last_update): +def get_feed(feed_url, last_update, include_images): new_entries = 0 feed = feedparser.parse(feed_url) if last_update: @@ -99,7 +101,7 @@ def get_feed(feed_url, last_update): generator = detect_generator(feed) for entry in entries: new_entries += 1 - yield get_entry(entry, generator) + yield get_entry(entry, include_images, generator) return new_entries def collect_images(entry, generator=None): @@ -145,7 +147,7 @@ def collect_images(entry, generator=None): return images -def get_entry(entry, generator=None): +def get_entry(entry, include_images, generator=None): hashtags = [] for tag in entry.get('tags', []): for t in tag['term'].split(): @@ -166,7 +168,7 @@ def get_entry(entry, generator=None): 'content': BeautifulSoup(summary, 'html.parser').get_text(), 'hashtags': ' '.join(hashtags), 'updated': dateutil.parser.parse(entry['updated']), - 'images': collect_images(entry, generator), + 'images': collect_images(entry, generator) if include_images else [], } def setup(config_file): @@ -198,12 +200,14 @@ def setup(config_file): feed_url = input('RSS/Atom feed URL to watch: ') old_posts = yes_no('Shall already existing entries be tooted, too?') + include_images = yes_no('Shall images be included in the toot?') config = { 'name': name, 'url': url, 'client_id': client_id, 'client_secret': client_secret, 'access_token': access_token, + 'include_images': include_images, 'feeds': [ {'url': feed_url, 'template': '{title} {url}'} ] From 8e51b4344dcc1f2e491d83fca95cfbe6d5090d7f Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 23 Apr 2019 21:56:04 +0200 Subject: [PATCH 19/31] Add command line option -n/--dry-run. 
--- feediverse.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/feediverse.py b/feediverse.py index 4f83683..6eb099a 100755 --- a/feediverse.py +++ b/feediverse.py @@ -29,6 +29,9 @@ codecs.register_error("urlencodereplace", __urlencodereplace_errors) def main(): parser = argparse.ArgumentParser() + parser.add_argument("-n", "--dry-run", action="store_true", + help=("perform a trial run with no changes made: " + "don't toot, don't save config")) parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE", help=("config file to use, default: %s" % DEFAULT_CONFIG_FILE), @@ -51,6 +54,9 @@ def main(): for feed in config['feeds']: for entry in get_feed(feed['url'], config['updated'], config['include_images']): + if args.dry_run: + print("trial run, not tooting") + continue media_ids = [] for img in entry.get("images", []): media = masto.media_post(img, img.headers['content-type']) @@ -60,7 +66,11 @@ def main(): entry.pop("images", None) masto.status_post(feed['template'].format(**entry)[:49999999999], media_ids=media_ids) - save_config(config, config_file) + if args.dry_run: + print("trial run, not saving the config") + else: + save_config(config, config_file) + def save_config(config, config_file, toot_old_posts=False): copy = dict(config) From 7a5b30aeef58cef7ab1b9661b18a1a454a13ecef Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 23 Apr 2019 21:59:00 +0200 Subject: [PATCH 20/31] Add option -v/--verbose. --- feediverse.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/feediverse.py b/feediverse.py index 6eb099a..61ebcee 100755 --- a/feediverse.py +++ b/feediverse.py @@ -32,6 +32,8 @@ def main(): parser.add_argument("-n", "--dry-run", action="store_true", help=("perform a trial run with no changes made: " "don't toot, don't save config")) + parser.add_argument("-v", "--verbose", action="store_true", + help="be verbose") parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE", help=("config file to use, default: %s" % DEFAULT_CONFIG_FILE), @@ -54,6 +56,8 @@ def main(): for feed in config['feeds']: for entry in get_feed(feed['url'], config['updated'], config['include_images']): + if args.verbose: + print(entry) if args.dry_run: print("trial run, not tooting") continue @@ -69,6 +73,8 @@ def main(): if args.dry_run: print("trial run, not saving the config") else: + if args.verbose: + print("saving the config") save_config(config, config_file) From e41073efbc9ff1cf23ea3310622be7eb3540342d Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 23 Apr 2019 22:05:53 +0200 Subject: [PATCH 21/31] Fix template element '{content}'. This was the same as '{summary}' and needs more attention, too. 
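Background for the fix: feedparser exposes content as a list of content
objects, each carrying a value field, while summary is a plain string. A rough
sketch of the shape involved (the entry is invented):

    entry = {
        "summary": "Short teaser ...",
        "content": [{"type": "text/html",
                     "value": "<p>The full post body ...</p>"}],
    }

    content = entry.get("content", "") or ""
    if content:
        content = content[0].get("value", "")
    # content is now '<p>The full post body ...</p>', ready for tag stripping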
--- feediverse.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/feediverse.py b/feediverse.py index 61ebcee..afb8764 100755 --- a/feediverse.py +++ b/feediverse.py @@ -169,7 +169,9 @@ def get_entry(entry, include_images, generator=None): for t in tag['term'].split(): hashtags.append('#' + t) summary = entry.get('summary', '') - content = entry.get('content', '') + content = entry.get('content', '') or '' + if content: + content = content[0].get('value', '') url = entry.id if generator == "wordpress": links = [l for l in entry.links if l.get("rel") == "alternate"] @@ -181,7 +183,7 @@ def get_entry(entry, include_images, generator=None): 'url': url, 'title': BeautifulSoup(entry.title, 'html.parser').get_text(), 'summary': BeautifulSoup(summary, 'html.parser').get_text(), - 'content': BeautifulSoup(summary, 'html.parser').get_text(), + 'content': BeautifulSoup(content, 'html.parser').get_text(), 'hashtags': ' '.join(hashtags), 'updated': dateutil.parser.parse(entry['updated']), 'images': collect_images(entry, generator) if include_images else [], From 09a3588f71b5f93f0d3e0c75b23ae76899a0c7c0 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 23 Apr 2019 22:06:32 +0200 Subject: [PATCH 22/31] Document template element '{content}'. --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 76a74e4..47d9b9a 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,11 @@ like so: `{hashtags}` will look for tags in the feed entry and turn them into a space separated list of hashtags. +`{content}` is the whole content of the feed entry (with html-tags +stripped). Please be aware that this might easily exceed Mastodon's +limit of 512 characters. + + ## Multiple Feeds Since *feeds* is a list you can add additional feeds to watch if you want. From 9e1a94d4cad8315beadfb1a7ffce7d98307283b8 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 23 Apr 2019 22:37:57 +0200 Subject: [PATCH 23/31] Add cleaning up white-space in fetched texts. 
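To show what the clean-up added in the diff below does, here are the same
substitutions applied to an invented sample:

    import re

    text = "Tea\xa0time   is   great.   \n\n\n\nRead More"
    text = re.sub('\xa0+', ' ', text)        # non-breaking spaces -> plain spaces
    text = re.sub(' +', ' ', text)           # collapse runs of spaces
    text = re.sub(' +\n', '\n', text)        # drop trailing spaces before newlines
    text = re.sub('\n\n\n+', '\n\n', text)   # allow at most one blank line
    print(text.strip())
    # Tea time is great.
    #
    # Read More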
--- feediverse.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/feediverse.py b/feediverse.py index afb8764..c390ce4 100755 --- a/feediverse.py +++ b/feediverse.py @@ -12,6 +12,7 @@ from bs4 import BeautifulSoup from mastodon import Mastodon from datetime import datetime, timezone, MINYEAR import urllib3 +import re DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") @@ -164,6 +165,15 @@ def collect_images(entry, generator=None): def get_entry(entry, include_images, generator=None): + + def cleanup(text): + text = BeautifulSoup(text, 'html.parser').get_text() + text = re.sub('\xa0+', ' ', text) + text = re.sub(' +', ' ', text) + text = re.sub(' +\n', '\n', text) + text = re.sub('\n\n\n+', '\n\n', text, flags=re.M) + return text.strip() + hashtags = [] for tag in entry.get('tags', []): for t in tag['term'].split(): @@ -171,7 +181,7 @@ def get_entry(entry, include_images, generator=None): summary = entry.get('summary', '') content = entry.get('content', '') or '' if content: - content = content[0].get('value', '') + content = cleanup(content[0].get('value', '')) url = entry.id if generator == "wordpress": links = [l for l in entry.links if l.get("rel") == "alternate"] @@ -181,9 +191,9 @@ def get_entry(entry, include_images, generator=None): url = links[0]["href"] return { 'url': url, - 'title': BeautifulSoup(entry.title, 'html.parser').get_text(), - 'summary': BeautifulSoup(summary, 'html.parser').get_text(), - 'content': BeautifulSoup(content, 'html.parser').get_text(), + 'title': cleanup(entry.title), + 'summary': cleanup(summary), + 'content': content, 'hashtags': ' '.join(hashtags), 'updated': dateutil.parser.parse(entry['updated']), 'images': collect_images(entry, generator) if include_images else [], From b57bc48d0d324d4809e1fcffd8e6e5191050c3f4 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Tue, 23 Apr 2019 22:44:50 +0200 Subject: [PATCH 24/31] Update readme. --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 47d9b9a..07a5fa4 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,9 @@ Once *feediverse* is configured you can add it to your crontab: */15 * * * * /usr/local/bin/feediverse +Run `feediverse --help` to show the comand line options. + + ## Post Format You can customize the post format by opening the configuration file (default is From 17bba74f22b1a43d4566aae1ff1dd5de985af0e5 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Mon, 16 Sep 2019 14:20:13 +0200 Subject: [PATCH 25/31] Readme: Add "Special Handling for Different Feed Generators". I should have had added this when adding the special support for Wordpress in around e6a16dbe55. --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 07a5fa4..1faf7ee 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,14 @@ Since *feeds* is a list you can add additional feeds to watch if you want. - url: https://example.org/feed/ template: "dot org: {title} {url}" + +## Special Handling for Different Feed Generators + +*feediverse* has support for some special cases of some feed +generators. For example detecting the entries perma-link. Currently +only Wordpress is handled, but others may follow. + + ## Why? I created *feediverse* because I wanted to send my Pinboard bookmarks to From 7df2d306e4488240d09f057131dee9999b09f521 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Mon, 16 Sep 2019 13:49:52 +0200 Subject: [PATCH 26/31] Don't crash if feed does not contain a "generator" element. 
--- feediverse.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/feediverse.py b/feediverse.py index c390ce4..afbc6a4 100755 --- a/feediverse.py +++ b/feediverse.py @@ -100,9 +100,10 @@ def read_config(config_file): def detect_generator(feed): # For RSS the generator tag holds the URL, while for ATOM it holds the name - if "/wordpress.org/" in feed.feed.generator: + generator = feed.feed.get("generator", "") + if "/wordpress.org/" in generator: return "wordpress" - elif "wordpress" == feed.feed.generator.lower(): + elif "wordpress" == generator.lower(): return "wordpress" return None From 52cf05c09c4cfd8ddb43d71645ee144bebcd729f Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Mon, 16 Sep 2019 14:28:00 +0200 Subject: [PATCH 27/31] Add feed config option `generator`. This allows setting or overwriting the generator provided by the feed. --- README.md | 9 +++++++++ feediverse.py | 8 +++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1faf7ee..084dc8a 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ Since *feeds* is a list you can add additional feeds to watch if you want. template: "dot com: {title} {url}" - url: https://example.org/feed/ template: "dot org: {title} {url}" + generator: wordpress ## Special Handling for Different Feed Generators @@ -60,6 +61,14 @@ Since *feeds* is a list you can add additional feeds to watch if you want. generators. For example detecting the entries perma-link. Currently only Wordpress is handled, but others may follow. +If a feed does not provide a proper *generator* entry, you can set it +by adding a `generator:` value to the feed's configuration. See the +seconds one in the example above. + +You can check whether feed provides a *generator* entry like this: + + feediverse --verbose --dry-run feedverse-test.rc | grep generator + ## Why? diff --git a/feediverse.py b/feediverse.py index afbc6a4..1e7ea2b 100755 --- a/feediverse.py +++ b/feediverse.py @@ -56,7 +56,8 @@ def main(): for feed in config['feeds']: for entry in get_feed(feed['url'], config['updated'], - config['include_images']): + config['include_images'], + generator=feed.get('generator')): if args.verbose: print(entry) if args.dry_run: @@ -107,7 +108,7 @@ def detect_generator(feed): return "wordpress" return None -def get_feed(feed_url, last_update, include_images): +def get_feed(feed_url, last_update, include_images, generator=None): new_entries = 0 feed = feedparser.parse(feed_url) if last_update: @@ -116,7 +117,7 @@ def get_feed(feed_url, last_update, include_images): else: entries = feed.entries entries.sort(key=lambda e: e.published_parsed) - generator = detect_generator(feed) + generator = generator or detect_generator(feed) for entry in entries: new_entries += 1 yield get_entry(entry, include_images, generator) @@ -198,6 +199,7 @@ def get_entry(entry, include_images, generator=None): 'hashtags': ' '.join(hashtags), 'updated': dateutil.parser.parse(entry['updated']), 'images': collect_images(entry, generator) if include_images else [], + '__generator__': generator, } def setup(config_file): From 5945a9f9cb1860deb174c65d11772662e65ba531 Mon Sep 17 00:00:00 2001 From: Hartmut Goebel Date: Mon, 16 Sep 2019 14:49:08 +0200 Subject: [PATCH 28/31] Add work-around for verbose-mode on non-unicode terminals. 
---
 feediverse.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/feediverse.py b/feediverse.py
index 1e7ea2b..2b94e03 100755
--- a/feediverse.py
+++ b/feediverse.py
@@ -59,7 +59,13 @@ def main():
                               config['include_images'],
                               generator=feed.get('generator')):
             if args.verbose:
-                print(entry)
+                try:
+                    print(entry)
+                except UnicodeEncodeError:
+                    # work-around for non-unicode terminals
+                    print(dict(
+                        (k, v.encode("utf-8") if hasattr(v, "encode") else v)
+                        for k, v in entry.items()))
             if args.dry_run:
                 print("trial run, not tooting")
                 continue

From 60d74188c3f2a8b88f8358ca7c261fb65cf5beb7 Mon Sep 17 00:00:00 2001
From: Hartmut Goebel
Date: Fri, 25 Sep 2020 19:18:47 +0200
Subject: [PATCH 29/31] Enhance cleanup of fetched texts.

Remove all HTML elements with a class "read-more" or a class matching
"read-more-*". This will remove the "Read More" links.
---
 feediverse.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/feediverse.py b/feediverse.py
index 2b94e03..1c43d33 100755
--- a/feediverse.py
+++ b/feediverse.py
@@ -175,7 +175,11 @@ def collect_images(entry, generator=None):
 def get_entry(entry, include_images, generator=None):
 
     def cleanup(text):
-        text = BeautifulSoup(text, 'html.parser').get_text()
+        html = BeautifulSoup(text, 'html.parser')
+        # Remove all elements of class read-more or read-more-*
+        for more in html.find_all(None, re.compile("^read-more($|-.*)")):
+            more.extract()
+        text = html.get_text()
         text = re.sub('\xa0+', ' ', text)
         text = re.sub(' +', ' ', text)
         text = re.sub(' +\n', '\n', text)

From 7c7f1c049c10209f83db6c7f0411eb77eae2a1d6 Mon Sep 17 00:00:00 2001
From: Hartmut Goebel
Date: Fri, 25 Sep 2020 21:03:56 +0200
Subject: [PATCH 30/31] With dry-run, print title of post.

This is to ease validating the results.
---
 feediverse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feediverse.py b/feediverse.py
index 1c43d33..54760c8 100755
--- a/feediverse.py
+++ b/feediverse.py
@@ -67,7 +67,7 @@ def main():
                         (k, v.encode("utf-8") if hasattr(v, "encode") else v)
                         for k, v in entry.items()))
             if args.dry_run:
-                print("trial run, not tooting")
+                print("trial run, not tooting ", entry["title"][:50])
                 continue
             media_ids = []
             for img in entry.get("images", []):

From 2748ac0da6f9e74c642b00eedf35ab969fa4abff Mon Sep 17 00:00:00 2001
From: Hartmut Goebel
Date: Fri, 25 Sep 2020 21:08:36 +0200
Subject: [PATCH 31/31] Fix: Posts published while feediverse is running are not tooted.

Fix race condition: If a post was published in the short period between
fetching the RSS feed and saving the config file, this post was never
tooted. This was caused by the timestamp in the config file being the
time the file was written, not the time the feed was fetched.

I (hopefully) fixed this by storing the latest post's timestamp in the
config file. This still might cause the same issue if several feeds are
checked using the same config file.
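The gist of the fix in the diff below, as a small runnable sketch; toot() is a
made-up stand-in for the Mastodon call and the entries are invented:

    from datetime import datetime, timezone

    def toot(entry):
        print("tooting", entry["title"])

    entries = [
        {"title": "older post",
         "updated": datetime(2020, 9, 25, 10, 0, tzinfo=timezone.utc)},
        {"title": "newer post",
         "updated": datetime(2020, 9, 25, 20, 0, tzinfo=timezone.utc)},
    ]
    last_run = datetime(2020, 9, 25, 9, 0, tzinfo=timezone.utc)

    newest_post = last_run
    for entry in entries:
        newest_post = max(newest_post, entry["updated"])
        toot(entry)

    # Save the newest *entry* timestamp, not datetime.now(): a post published
    # between the fetch and the save is newer than newest_post and will still
    # be picked up on the next run.
    print(newest_post.isoformat())   # 2020-09-25T20:00:00+00:00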
---
 feediverse.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/feediverse.py b/feediverse.py
index 54760c8..9a661a9 100755
--- a/feediverse.py
+++ b/feediverse.py
@@ -54,10 +54,12 @@ def main():
         access_token=config['access_token']
     )
+    newest_post = config['updated']
     for feed in config['feeds']:
         for entry in get_feed(feed['url'], config['updated'],
                               config['include_images'],
                               generator=feed.get('generator')):
+            newest_post = max(newest_post, entry['updated'])
             if args.verbose:
                 try:
                     print(entry)
                 except UnicodeEncodeError:
@@ -78,6 +80,8 @@ def main():
             entry.pop("images", None)
             masto.status_post(feed['template'].format(**entry)[:49999999999],
                               media_ids=media_ids)
+
+    config['updated'] = newest_post.isoformat()
     if args.dry_run:
         print("trial run, not saving the config")
     else:
@@ -86,10 +90,8 @@ def main():
             print("saving the config")
         save_config(config, config_file)
 
-def save_config(config, config_file, toot_old_posts=False):
+def save_config(config, config_file):
     copy = dict(config)
-    if not toot_old_posts:
-        copy['updated'] = datetime.now(tz=timezone.utc).isoformat()
     with open(config_file, 'w') as fh:
         fh.write(yaml.dump(copy, default_flow_style=False))
 
@@ -253,7 +255,9 @@ def setup(config_file):
             {'url': feed_url, 'template': '{title} {url}'}
         ]
     }
-    save_config(config, config_file, old_posts)
+    if not old_posts:
+        config['updated'] = datetime.now(tz=timezone.utc).isoformat()
+    save_config(config, config_file)
     print("")
     print("Your feediverse configuration has been saved to {}".format(config_file))
     print("Add a line line this to your crontab to check every 15 minutes:")