diff --git a/README.md b/README.md index 6c5ff81..9e7d557 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,9 @@ Once *feediverse* is configured you can add it to your crontab: */15 * * * * /usr/local/bin/feediverse +Run `feediverse --help` to show the command line options. + + ## Post Format You can customize the post format by opening the configuration file (default is @@ -34,6 +37,11 @@ like so: `{hashtags}` will look for tags in the feed entry and turn them into a space separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{link}` instead of `{url}`. +`{content}` is the whole content of the feed entry (with html-tags +stripped). Please be aware that this might easily exceed Mastodon's +limit of 512 characters. + + ## Multiple Feeds Since *feeds* is a list you can add additional feeds to watch if you want. @@ -44,6 +52,23 @@ Since *feeds* is a list you can add additional feeds to watch if you want. template: "dot com: {title} {url}" - url: https://example.org/feed/ template: "dot org: {title} {url}" + generator: wordpress + + +## Special Handling for Different Feed Generators + +*feediverse* has support for some special cases of some feed +generators. For example detecting the entry's perma-link. Currently +only Wordpress is handled, but others may follow. + +If a feed does not provide a proper *generator* entry, you can set it +by adding a `generator:` value to the feed's configuration. See the +second one in the example above. + +You can check whether a feed provides a *generator* entry like this: + + feediverse --verbose --dry-run feedverse-test.rc | grep generator + ## Why? 
diff --git a/feediverse.py b/feediverse.py index b7e015d..f434fac 100755 --- a/feediverse.py +++ b/feediverse.py @@ -2,18 +2,41 @@ import os import sys +import codecs import argparse import yaml import dateutil import feedparser +from bs4 import BeautifulSoup from mastodon import Mastodon -from datetime import datetime, timezone +from datetime import datetime, timezone, MINYEAR +import urllib3 +import re + + +DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") +MAX_IMAGES = 4 # Mastodon allows attaching 4 images max. + +http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',) + +# encoding error-handler for buggy wordpress urls +def __urlencodereplace_errors(exc): + bs = exc.object[exc.start:exc.end].encode("utf-8") + bs = b"".join(b'%%%X' % b for b in bs) + return (bs, exc.end) +codecs.register_error("urlencodereplace", __urlencodereplace_errors) + DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") def main(): parser = argparse.ArgumentParser() + parser.add_argument("-n", "--dry-run", action="store_true", + help=("perform a trial run with no changes made: " + "don't toot, don't save config")) + parser.add_argument("-v", "--verbose", action="store_true", + help="be verbose") parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE", help=("config file to use, default: %s" % DEFAULT_CONFIG_FILE), @@ -33,57 +56,175 @@ def main(): access_token=config['access_token'] ) + newest_post = config['updated'] for feed in config['feeds']: - for entry in get_feed(feed['url'], config['updated']): - masto.status_post(feed['template'].format(**entry)[0:49999999999]) + for entry in get_feed(feed['url'], config['updated'], + config['include_images'], + generator=feed.get('generator')): + newest_post = max(newest_post, entry['updated']) + if args.verbose: + try: + print(entry) + except UnicodeEncodeError: + # work-around for non-unicode terminals + print(dict( + (k, v.encode("utf-8") if hasattr(v, "encode") else v) + for k, v in entry.items())) + if args.dry_run: + 
print("trial run, not tooting ", entry["title"][:50]) + continue + media_ids = [] + for img in entry.get("images", []): + media = masto.media_post(img, img.headers['content-type']) + img.release_conn() # deferred from collect_images() + if not 'error' in media: + media_ids.append(media) + entry.pop("images", None) + masto.status_post(feed['template'].format(**entry)[:49999999999], + media_ids=media_ids) - save_config(config, config_file) + config['updated'] = newest_post.isoformat() + if args.dry_run: + print("trial run, not saving the config") + else: + if args.verbose: + print("saving the config") + save_config(config, config_file) def save_config(config, config_file): copy = dict(config) - copy['updated'] = datetime.now(tz=timezone.utc).isoformat() with open(config_file, 'w') as fh: fh.write(yaml.dump(copy, default_flow_style=False)) def read_config(config_file): - config = {} + config = { + 'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc), + 'include_images': False, + } with open(config_file) as fh: - config = yaml.load(fh) - if 'updated' in config: - config['updated'] = dateutil.parser.parse(config['updated']) - else: - config['updated'] = datetime.now(tz=timezone.utc) + cfg = yaml.load(fh, yaml.SafeLoader) + if 'updated' in cfg: + cfg['updated'] = dateutil.parser.parse(cfg['updated']) + config.update(cfg) return config -def get_feed(feed_url, last_update): +def detect_generator(feed): + # For RSS the generator tag holds the URL, while for ATOM it holds the name + generator = feed.feed.get("generator", "") + if "/wordpress.org/" in generator: + return "wordpress" + elif "wordpress" == generator.lower(): + return "wordpress" + return None + +def get_feed(feed_url, last_update, include_images, generator=None): new_entries = 0 feed = feedparser.parse(feed_url) - feed.entries.sort(key=lambda e: e.published_parsed) - for entry in feed.entries: - e = get_entry(entry) - if last_update is None or e['updated'] > last_update: - new_entries += 1 - yield e + 
if last_update: + entries = [e for e in feed.entries + if dateutil.parser.parse(e['updated']) > last_update] + else: + entries = feed.entries + entries.sort(key=lambda e: e.published_parsed) + generator = generator or detect_generator(feed) + for entry in entries: + new_entries += 1 + yield get_entry(entry, include_images, generator) return new_entries -def get_entry(entry): +def collect_images(entry, generator=None): + + def find_urls(part): + if not part: + return + soup = BeautifulSoup(part, 'html.parser') + for tag in soup.find_all(["a", "img"]): + if tag.name == "a": + url = tag["href"] + elif tag.name == "img": + url = tag["src"] + if url not in urls: + urls.append(url) + + urls = [] + find_urls(entry.get("summary", "")) + for c in entry.get("content", []): + find_urls(c.value) + for e in (entry.enclosures + + [l for l in entry.links if l.get("rel") == "enclosure"]): + if (e["type"].startswith(("image/", "video/")) and + e["href"] not in urls): + urls.append(e["href"]) + if generator == "wordpress": + urls = (u for u in urls if not "/wp-content/plugins/" in u) + # Work around a wordpress bug: If the filename contains an + # umlaut, this will not be encoded using %-escape, as the + # standard demands. This will break encoding in http.request() + urls = (u.encode("ascii", "urlencodereplace").decode() + for u in urls) + images = [] + for url in urls: + resp = http.request('GET', url, preload_content=False) + if resp.headers['content-type'].startswith(("image/", "video/")): + images.append(resp) + # IMPORTANT: Need to release_conn() later! 
+ if len(images) >= MAX_IMAGES: + break + else: + resp.release_conn() + return images + + +def get_entry(entry, include_images, generator=None): + + def cleanup(text): + html = BeautifulSoup(text, 'html.parser') + # Remove all elements of class read-more or read-more-* + for more in html.find_all(None, re.compile("^read-more($|-.*)")): + more.extract() + text = html.get_text() + text = re.sub('\xa0+', ' ', text) + text = re.sub(' +', ' ', text) + text = re.sub(' +\n', '\n', text) + text = re.sub('\n\n\n+', '\n\n', text, flags=re.M) + return text.strip() + hashtags = [] for tag in entry.get('tags', []): - for t in tag['term'].split(' '): - hashtags.append('#{}'.format(t)) + for t in tag['term'].split(): + hashtags.append('#' + t) + summary = entry.get('summary', '') + content = entry.get('content', '') or '' + if content: + content = cleanup(content[0].get('value', '')) + url = entry.id + if generator == "wordpress": + links = [l for l in entry.links if l.get("rel") == "alternate"] + if len(links) > 1: + links = [l for l in entry.links if l.get("type") == "text/html"] + if links: + url = links[0]["href"] return { - 'url': entry.id, + 'url': url, 'link': entry.link, - 'title': entry.title, - 'summary': entry.get('summary', ''), + 'title': cleanup(entry.title), + 'summary': cleanup(summary), + 'content': content, 'hashtags': ' '.join(hashtags), 'updated': dateutil.parser.parse(entry['updated']), + 'images': collect_images(entry, generator) if include_images else [], + '__generator__': generator, } def setup(config_file): + + def yes_no(question): + res = input(question + ' [y/n] ') + return res.lower() in "y1" + url = input('What is your Mastodon Instance URL? ') - have_app = input('Do you have your app credentials already? 
[y/n] ') - if have_app.lower() == 'y': + have_app = yes_no('Do you have your app credentials already?') + if have_app: name = 'feediverse' client_id = input('What is your app\'s client id: ') client_secret = input('What is your client secret: ') @@ -103,16 +244,21 @@ def setup(config_file): access_token = m.log_in(username, password) feed_url = input('RSS/Atom feed URL to watch: ') + old_posts = yes_no('Shall already existing entries be tooted, too?') + include_images = yes_no('Shall images be included in the toot?') config = { 'name': name, 'url': url, 'client_id': client_id, 'client_secret': client_secret, 'access_token': access_token, + 'include_images': include_images, 'feeds': [ {'url': feed_url, 'template': '{title} {url}'} ] } + if not old_posts: + config['updated'] = datetime.now(tz=timezone.utc).isoformat() save_config(config, config_file) print("") print("Your feediverse configuration has been saved to {}".format(config_file)) diff --git a/setup.py b/setup.py index 0f0a766..3c1bb2a 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,11 @@ setup( description='Connect an RSS Feed to Mastodon', long_description=long_description, long_description_content_type="text/markdown", - install_requires=['feedparser', 'mastodon.py', 'python-dateutil', 'pyyaml'], + install_requires=['beautifulsoup4', + 'feedparser', + 'mastodon.py', + 'python-dateutil', + 'pyyaml', + 'urllib3[secure]'], entry_points={'console_scripts': ['feediverse = feediverse:main']} )