diff --git a/LICENSE b/LICENSE index 43ddc20..bacb4de 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2018 Ed Summers +Copyright (c) Ed Summers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 5159f32..4ff2224 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -*feediverse* will read RSS/Atom feeds and send the messages as Mastodon posts. Please use responsibly! *feediverse* is kind of the same thing as [feed2toot] or [rss-to-activitypub](https://github.com/dariusk/rss-to-activitypub/) but it's just one module that works with Python 3, and I was bored. +*feediverse* will read RSS/Atom feeds and send the messages as Mastodon posts. +It's meant to add a little bit of spice to your timeline from other places. +Please use it responsibly. ## Install @@ -18,7 +20,6 @@ Once *feediverse* is configured you can add it to your crontab: Run `feediverse --help` to show the command line options. - ## Post Format You can customize the post format by opening the configuration file (default is @@ -39,7 +40,6 @@ separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{l stripped). Please be aware that this might easily exceed Mastodon's limit of 512 characters. - ## Multiple Feeds Since *feeds* is a list you can add additional feeds to watch if you want. @@ -50,36 +50,4 @@ Since *feeds* is a list you can add additional feeds to watch if you want. template: "dot com: {title} {url}" - url: https://example.org/feed/ template: "dot org: {title} {url}" - generator: wordpress - - -## Special Handling for Different Feed Generators - -*feediverse* has support for some special cases of some feed -generators. For example detecting the entries perma-link. Currently -only Wordpress is handled, but others may follow. - -If a feed does not provide a proper *generator* entry, you can set it -by adding a `generator:` value to the feed's configuration. See the -seconds one in the example above. - -You can check whether feed provides a *generator* entry like this: - - feediverse --verbose --dry-run feedverse-test.rc | grep generator - -## Why? - -I created *feediverse* because I wanted to send my Pinboard bookmarks to -Mastodon. I've got an IFTTT recipe that does this for Twitter, but IFTTT -doesn't appear to work with Mastodon yet. That being said *feediverse* should -work with any RSS or Atom feed (thanks to [feedparser]). - -## Warning! - -Please use responsibly. Don't fill up Mastodon with tons of junk just because -you can. That kind of toxic behavior is why a lot of people are trying to -establish other forms of social media like Mastodon. - -[feed2toot]: https://gitlab.com/chaica/feed2toot/ -[feedparser]: http://feedparser.org/ diff --git a/feediverse.py b/feediverse.py index cb2465c..cee0078 100755 --- a/feediverse.py +++ b/feediverse.py @@ -4,9 +4,7 @@ import os import re import sys import yaml -import codecs import argparse -import urllib3 import dateutil import feedparser @@ -14,19 +12,7 @@ from bs4 import BeautifulSoup from mastodon import Mastodon from datetime import datetime, timezone, MINYEAR - DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") -MAX_IMAGES = 4 # Mastodon allows attaching 4 images max. - -http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',) - -# encoding error-handler for buggy wordpress urls -def __urlencodereplace_errors(exc): - bs = exc.object[exc.start:exc.end].encode("utf-8") - bs = b"".join(b'%%%X' % b for b in bs) - return (bs, exc.end) -codecs.register_error("urlencodereplace", __urlencodereplace_errors) - def main(): parser = argparse.ArgumentParser() @@ -61,67 +47,20 @@ def main(): for feed in config['feeds']: if args.verbose: print(f"fetching {feed['url']} entries since {config['updated']}") - for entry in get_feed(feed['url'], config['updated'], - config['include_images'], - generator=feed.get('generator')): + for entry in get_feed(feed['url'], config['updated']): newest_post = max(newest_post, entry['updated']) if args.verbose: - try: - print(entry) - except UnicodeEncodeError: - # work-around for non-unicode terminals - print(dict( - (k, v.encode("utf-8") if hasattr(v, "encode") else v) - for k, v in entry.items())) + print(entry) if args.dry_run: print("trial run, not tooting ", entry["title"][:50]) continue - media_ids = [] - for img in entry.get("images", []): - media = masto.media_post(img, img.headers['content-type']) - img.release_conn() # deferred from collect_images() - if not 'error' in media: - media_ids.append(media) - entry.pop("images", None) - masto.status_post(feed['template'].format(**entry)[:499], - media_ids=media_ids) + masto.status_post(feed['template'].format(**entry)[:499]) - config['updated'] = newest_post.isoformat() - if args.dry_run: - print("trial run, not saving the config") - else: - if args.verbose: - print("saving the config", config_file) + if not args.dry_run: + config['updated'] = newest_post.isoformat() save_config(config, config_file) -def save_config(config, config_file): - copy = dict(config) - with open(config_file, 'w') as fh: - fh.write(yaml.dump(copy, default_flow_style=False)) - -def read_config(config_file): - config = { - 'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc), - 'include_images': False, - } - with open(config_file) as fh: - cfg = yaml.load(fh, yaml.SafeLoader) - if 'updated' in cfg: - cfg['updated'] = dateutil.parser.parse(cfg['updated']) - config.update(cfg) - return config - -def detect_generator(feed): - # For RSS the generator tag holds the URL, while for ATOM it holds the name - generator = feed.feed.get("generator", "") - if "/wordpress.org/" in generator: - return "wordpress" - elif "wordpress" == generator.lower(): - return "wordpress" - return None - -def get_feed(feed_url, last_update, include_images, generator=None): - new_entries = 0 +def get_feed(feed_url, last_update): feed = feedparser.parse(feed_url) if last_update: entries = [e for e in feed.entries @@ -129,73 +68,10 @@ def get_feed(feed_url, last_update, include_images, generator=None): else: entries = feed.entries entries.sort(key=lambda e: e.updated_parsed) - generator = generator or detect_generator(feed) for entry in entries: - new_entries += 1 - yield get_entry(entry, include_images, generator) - return new_entries - -def collect_images(entry, generator=None): - - def find_urls(part): - if not part: - return - soup = BeautifulSoup(part, 'html.parser') - for tag in soup.find_all(["a", "img"]): - if tag.name == "a": - url = tag.get("href") - elif tag.name == "img": - url = tag.get("src") - if url and url not in urls: - urls.append(url) - - urls = [] - find_urls(entry.get("summary", "")) - for c in entry.get("content", []): - find_urls(c.value) - for e in (entry.enclosures - + [l for l in entry.links if l.get("rel") == "enclosure"]): - if (e["type"].startswith(("image/", "video/")) and - e["href"] not in urls): - urls.append(e["href"]) - if generator == "wordpress": - urls = (u for u in urls if not "/wp-content/plugins/" in u) - # Work around a wordpress bug: If the filename contains an - # umlaut, this will not be encoded using %-escape, as the - # standard demands. This will break encoding in http.request() - urls = (u.encode("ascii", "urlencodereplace").decode() - for u in urls) - images = [] - for url in urls: - try: - resp = http.request('GET', url, preload_content=False) - if resp.headers.get('content-type', '').startswith(("image/", "video/")): - images.append(resp) - # IMPORTANT: Need to release_conn() later! - if len(images) >= MAX_IMAGES: - break - else: - resp.release_conn() - except urllib3.exceptions.HTTPError: - # ignore http errors, maybe they should be logged? - pass - return images - - -def get_entry(entry, include_images, generator=None): - - def cleanup(text): - html = BeautifulSoup(text, 'html.parser') - # Remove all elements of class read-more or read-more-* - for more in html.find_all(None, re.compile("^read-more($|-.*)")): - more.extract() - text = html.get_text() - text = re.sub('\xa0+', ' ', text) - text = re.sub(' +', ' ', text) - text = re.sub(' +\n', '\n', text) - text = re.sub('\n\n\n+', '\n\n', text, flags=re.M) - return text.strip() + yield get_entry(entry) +def get_entry(entry): hashtags = [] for tag in entry.get('tags', []): t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '') @@ -205,14 +81,6 @@ def get_entry(entry, include_images, generator=None): if content: content = cleanup(content[0].get('value', '')) url = entry.id - if generator == "wordpress": - links = [l for l in entry.links if l.get("rel") == "alternate"] - if len(links) > 1: - links = [l for l in entry.links if l.get("type") == "text/html"] - if links: - url = links[0]["href"] - t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '') - hashtags.append('#{}'.format(t)) return { 'url': url, 'link': entry.link, @@ -220,17 +88,53 @@ def get_entry(entry, include_images, generator=None): 'summary': cleanup(summary), 'content': content, 'hashtags': ' '.join(hashtags), - 'updated': dateutil.parser.parse(entry['updated']), - 'images': collect_images(entry, generator) if include_images else [], - '__generator__': generator, + 'updated': dateutil.parser.parse(entry['updated']) } +def cleanup(text): + html = BeautifulSoup(text, 'html.parser') + text = html.get_text() + text = re.sub('\xa0+', ' ', text) + text = re.sub(' +', ' ', text) + text = re.sub(' +\n', '\n', text) + text = re.sub('\n\n\n+', '\n\n', text, flags=re.M) + return text.strip() + +def find_urls(html): + if not html: + return + urls = [] + soup = BeautifulSoup(html, 'html.parser') + for tag in soup.find_all(["a", "img"]): + if tag.name == "a": + url = tag.get("href") + elif tag.name == "img": + url = tag.get("src") + if url and url not in urls: + urls.append(url) + return urls + +def yes_no(question): + res = input(question + ' [y/n] ') + return res.lower() in "y1" + +def save_config(config, config_file): + copy = dict(config) + with open(config_file, 'w') as fh: + fh.write(yaml.dump(copy, default_flow_style=False)) + +def read_config(config_file): + config = { + 'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc) + } + with open(config_file) as fh: + cfg = yaml.load(fh, yaml.SafeLoader) + if 'updated' in cfg: + cfg['updated'] = dateutil.parser.parse(cfg['updated']) + config.update(cfg) + return config + def setup(config_file): - - def yes_no(question): - res = input(question + ' [y/n] ') - return res.lower() in "y1" - url = input('What is your Mastodon Instance URL? ') have_app = yes_no('Do you have your app credentials already?') if have_app: @@ -254,14 +158,12 @@ def setup(config_file): feed_url = input('RSS/Atom feed URL to watch: ') old_posts = yes_no('Shall already existing entries be tooted, too?') - include_images = yes_no('Shall images be included in the toot?') config = { 'name': name, 'url': url, 'client_id': client_id, 'client_secret': client_secret, 'access_token': access_token, - 'include_images': include_images, 'feeds': [ {'url': feed_url, 'template': '{title} {url}'} ] diff --git a/setup.py b/setup.py index 5effcbb..2fa17f9 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ with open("README.md") as f: setup( name='feediverse', - version='0.2.2', + version='0.3.0', python_requires='>=3.3', url='https://github.com/edsu/feediverse', author='Ed Summers', @@ -18,7 +18,6 @@ setup( 'feedparser', 'mastodon.py', 'python-dateutil', - 'pyyaml', - 'urllib3[secure]'], + 'pyyaml'], entry_points={'console_scripts': ['feediverse = feediverse:main']} )