Compare commits
	
		
			4 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| e9d58c95be | |||
| f3daed0bfb | |||
| 7a90313f1e | |||
| be69e525b9 | 
							
								
								
									
										2
									
								
								LICENSE
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								LICENSE
									
									
									
									
									
								
							| @ -1,6 +1,6 @@ | ||||
| The MIT License (MIT) | ||||
|  | ||||
| Copyright (c) 2018 Ed Summers | ||||
| Copyright (c) Ed Summers | ||||
|  | ||||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||||
| of this software and associated documentation files (the "Software"), to deal | ||||
|  | ||||
							
								
								
									
										38
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										38
									
								
								README.md
									
									
									
									
									
								
							| @ -1,6 +1,6 @@ | ||||
| *feediverse* will read RSS/Atom feeds and send the messages as Mastodon posts. | ||||
| Please use responsibly! *feediverse* is kind of the same thing as [feed2toot] | ||||
| but it's just one module that works with Python 3, and I was bored. | ||||
| It's meant to add a little bit of spice to your timeline from other places. | ||||
| Please use it responsibly. | ||||
|  | ||||
| ## Install | ||||
|  | ||||
| @ -20,7 +20,6 @@ Once *feediverse* is configured you can add it to your crontab: | ||||
|  | ||||
| Run `feediverse --help` to show the command line options. | ||||
|  | ||||
|  | ||||
| ## Post Format | ||||
|  | ||||
| You can customize the post format by opening the configuration file (default is | ||||
| @ -41,7 +40,6 @@ separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{l | ||||
| stripped). Please be aware that this might easily exceed Mastodon's | ||||
| limit of 512 characters. | ||||
|  | ||||
|  | ||||
| ## Multiple Feeds | ||||
|  | ||||
| Since *feeds* is a list you can add additional feeds to watch if you want. | ||||
| @ -52,36 +50,4 @@ Since *feeds* is a list you can add additional feeds to watch if you want. | ||||
|         template: "dot com: {title} {url}" | ||||
|       - url: https://example.org/feed/ | ||||
|         template: "dot org: {title} {url}" | ||||
|         generator: wordpress | ||||
|  | ||||
|  | ||||
| ## Special Handling for Different Feed Generators | ||||
|  | ||||
| *feediverse* has support for some special cases of some feed | ||||
| generators. For example, detecting an entry's permalink. Currently | ||||
| only WordPress is handled, but others may follow. | ||||
|  | ||||
| If a feed does not provide a proper *generator* entry, you can set it | ||||
| by adding a `generator:` value to the feed's configuration. See the | ||||
| second one in the example above. | ||||
|  | ||||
| You can check whether a feed provides a *generator* entry like this: | ||||
|  | ||||
|     feediverse --verbose --dry-run feedverse-test.rc | grep generator | ||||
|  | ||||
| ## Why? | ||||
|  | ||||
| I created *feediverse* because I wanted to send my Pinboard bookmarks to | ||||
| Mastodon.  I've got an IFTTT recipe that does this for Twitter, but IFTTT | ||||
| doesn't appear to work with Mastodon yet. That being said *feediverse* should | ||||
| work with any RSS or Atom feed (thanks to [feedparser]). | ||||
|  | ||||
| ## Warning! | ||||
|  | ||||
| Please use responsibly. Don't fill up Mastodon with tons of junk just because | ||||
| you can. That kind of toxic behavior is why a lot of people are trying to | ||||
| establish other forms of social media like Mastodon. | ||||
|  | ||||
| [feed2toot]: https://gitlab.com/chaica/feed2toot/ | ||||
| [feedparser]: http://feedparser.org/ | ||||
|  | ||||
|  | ||||
							
								
								
									
										204
									
								
								feediverse.py
									
									
									
									
									
								
							
							
						
						
									
										204
									
								
								feediverse.py
									
									
									
									
									
								
							| @ -4,9 +4,7 @@ import os | ||||
| import re | ||||
| import sys | ||||
| import yaml | ||||
| import codecs | ||||
| import argparse | ||||
| import urllib3 | ||||
| import dateutil | ||||
| import feedparser | ||||
|  | ||||
| @ -14,19 +12,7 @@ from bs4 import BeautifulSoup | ||||
| from mastodon import Mastodon | ||||
| from datetime import datetime, timezone, MINYEAR | ||||
|  | ||||
|  | ||||
| DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") | ||||
| MAX_IMAGES = 4  # Mastodon allows attaching 4 images max. | ||||
|  | ||||
| http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',) | ||||
|  | ||||
| # encoding error-handler for buggy wordpress urls | ||||
| def __urlencodereplace_errors(exc): | ||||
|     bs = exc.object[exc.start:exc.end].encode("utf-8") | ||||
|     bs = b"".join(b'%%%X' % b for b in bs) | ||||
|     return (bs, exc.end) | ||||
| codecs.register_error("urlencodereplace", __urlencodereplace_errors) | ||||
|  | ||||
|  | ||||
| def main(): | ||||
|     parser = argparse.ArgumentParser() | ||||
| @ -59,67 +45,22 @@ def main(): | ||||
|  | ||||
|     newest_post = config['updated'] | ||||
|     for feed in config['feeds']: | ||||
|         for entry in get_feed(feed['url'], config['updated'], | ||||
|                               config['include_images'], | ||||
|                               generator=feed.get('generator')): | ||||
|         if args.verbose: | ||||
|             print(f"fetching {feed['url']} entries since {config['updated']}") | ||||
|         for entry in get_feed(feed['url'], config['updated']): | ||||
|             newest_post = max(newest_post, entry['updated']) | ||||
|             if args.verbose: | ||||
|                 try: | ||||
|                     print(entry) | ||||
|                 except UnicodeEncodeError: | ||||
|                     # work-around for non-unicode terminals | ||||
|                     print(dict( | ||||
|                         (k, v.encode("utf-8") if hasattr(v, "encode") else v) | ||||
|                         for k, v in entry.items())) | ||||
|                 print(entry) | ||||
|             if args.dry_run: | ||||
|                 print("trial run, not tooting ", entry["title"][:50]) | ||||
|                 continue | ||||
|             media_ids = [] | ||||
|             for img in entry.get("images", []): | ||||
|                 media = masto.media_post(img, img.headers['content-type']) | ||||
|                 img.release_conn()  # deferred from collect_images() | ||||
|                 if not 'error' in media: | ||||
|                     media_ids.append(media) | ||||
|             entry.pop("images", None) | ||||
|             masto.status_post(feed['template'].format(**entry)[:499], | ||||
|                               media_ids=media_ids) | ||||
|             masto.status_post(feed['template'].format(**entry)[:499]) | ||||
|  | ||||
|     config['updated'] = newest_post.isoformat() | ||||
|     if args.dry_run: | ||||
|         print("trial run, not saving the config") | ||||
|     else: | ||||
|         if args.verbose: | ||||
|             print("saving the config", config_file) | ||||
|     if not args.dry_run: | ||||
|         config['updated'] = newest_post.isoformat() | ||||
|         save_config(config, config_file) | ||||
|  | ||||
| def save_config(config, config_file): | ||||
|     copy = dict(config) | ||||
|     with open(config_file, 'w') as fh: | ||||
|         fh.write(yaml.dump(copy, default_flow_style=False)) | ||||
|  | ||||
| def read_config(config_file): | ||||
|     config = { | ||||
|         'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc), | ||||
|         'include_images': False, | ||||
|     } | ||||
|     with open(config_file) as fh: | ||||
|         cfg = yaml.load(fh, yaml.SafeLoader) | ||||
|         if 'updated' in cfg: | ||||
|             cfg['updated'] = dateutil.parser.parse(cfg['updated']) | ||||
|     config.update(cfg) | ||||
|     return config | ||||
|  | ||||
| def detect_generator(feed): | ||||
|     # For RSS the generator tag holds the URL, while for ATOM it holds the name | ||||
|     generator = feed.feed.get("generator", "") | ||||
|     if "/wordpress.org/" in generator: | ||||
|         return "wordpress" | ||||
|     elif "wordpress" == generator.lower(): | ||||
|         return "wordpress" | ||||
|     return None | ||||
|  | ||||
| def get_feed(feed_url, last_update, include_images, generator=None): | ||||
|     new_entries = 0 | ||||
| def get_feed(feed_url, last_update): | ||||
|     feed = feedparser.parse(feed_url) | ||||
|     if last_update: | ||||
|         entries = [e for e in feed.entries | ||||
| @ -127,73 +68,10 @@ def get_feed(feed_url, last_update, include_images, generator=None): | ||||
|     else: | ||||
|         entries = feed.entries | ||||
|     entries.sort(key=lambda e: e.updated_parsed) | ||||
|     generator = generator or detect_generator(feed) | ||||
|     for entry in entries: | ||||
|         new_entries += 1 | ||||
|         yield get_entry(entry, include_images, generator) | ||||
|     return new_entries | ||||
|  | ||||
| def collect_images(entry, generator=None): | ||||
|  | ||||
|     def find_urls(part): | ||||
|         if not part: | ||||
|             return | ||||
|         soup = BeautifulSoup(part, 'html.parser') | ||||
|         for tag in soup.find_all(["a", "img"]): | ||||
|             if tag.name == "a": | ||||
|                 url = tag.get("href") | ||||
|             elif tag.name == "img": | ||||
|                 url = tag.get("src") | ||||
|             if url and url not in urls: | ||||
|                 urls.append(url) | ||||
|  | ||||
|     urls = [] | ||||
|     find_urls(entry.get("summary", "")) | ||||
|     for c in entry.get("content", []): | ||||
|         find_urls(c.value) | ||||
|     for e in (entry.enclosures | ||||
|               + [l for l in entry.links if l.get("rel") == "enclosure"]): | ||||
|         if (e["type"].startswith(("image/", "video/")) and | ||||
|             e["href"] not in urls): | ||||
|             urls.append(e["href"]) | ||||
|     if generator == "wordpress": | ||||
|         urls = (u for u in urls if not "/wp-content/plugins/" in u) | ||||
|         # Work around a wordpress bug: If the filename contains an | ||||
|         # umlaut, this will not be encoded using %-escape, as the | ||||
|         # standard demands. This will break encoding in http.request() | ||||
|         urls = (u.encode("ascii", "urlencodereplace").decode() | ||||
|                 for u in urls) | ||||
|     images = [] | ||||
|     for url in urls: | ||||
|         try: | ||||
|             resp = http.request('GET', url, preload_content=False) | ||||
|             if resp.headers['content-type'].startswith(("image/", "video/")): | ||||
|                 images.append(resp) | ||||
|                 # IMPORTANT: Need to release_conn() later! | ||||
|                 if len(images) >= MAX_IMAGES: | ||||
|                     break | ||||
|             else: | ||||
|                 resp.release_conn() | ||||
|         except urllib3.exceptions.HTTPError: | ||||
|             # ignore http errors, maybe they should be logged? | ||||
|             pass | ||||
|     return images | ||||
|  | ||||
|  | ||||
| def get_entry(entry, include_images, generator=None): | ||||
|  | ||||
|     def cleanup(text): | ||||
|         html = BeautifulSoup(text, 'html.parser') | ||||
|         # Remove all elements of class read-more or read-more-* | ||||
|         for more in html.find_all(None, re.compile("^read-more($|-.*)")): | ||||
|             more.extract() | ||||
|         text = html.get_text() | ||||
|         text = re.sub('\xa0+', ' ', text) | ||||
|         text = re.sub('  +', ' ', text) | ||||
|         text = re.sub(' +\n', '\n', text) | ||||
|         text = re.sub('\n\n\n+', '\n\n', text, flags=re.M) | ||||
|         return text.strip() | ||||
|         yield get_entry(entry) | ||||
|  | ||||
| def get_entry(entry): | ||||
|     hashtags = [] | ||||
|     for tag in entry.get('tags', []): | ||||
|         t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '') | ||||
| @ -203,14 +81,6 @@ def get_entry(entry, include_images, generator=None): | ||||
|     if content: | ||||
|         content = cleanup(content[0].get('value', '')) | ||||
|     url = entry.id | ||||
|     if generator == "wordpress": | ||||
|         links = [l for l in entry.links if l.get("rel") == "alternate"] | ||||
|         if len(links) > 1: | ||||
|             links = [l for l in entry.links if l.get("type") == "text/html"] | ||||
|         if links: | ||||
|             url = links[0]["href"] | ||||
|         t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '') | ||||
|         hashtags.append('#{}'.format(t)) | ||||
|     return { | ||||
|         'url': url, | ||||
|         'link': entry.link, | ||||
| @ -218,17 +88,53 @@ def get_entry(entry, include_images, generator=None): | ||||
|         'summary': cleanup(summary), | ||||
|         'content': content, | ||||
|         'hashtags': ' '.join(hashtags), | ||||
|         'updated': dateutil.parser.parse(entry['updated']), | ||||
|         'images': collect_images(entry, generator) if include_images else [], | ||||
|         '__generator__': generator, | ||||
|         'updated': dateutil.parser.parse(entry['updated']) | ||||
|     } | ||||
|  | ||||
| def cleanup(text): | ||||
|     html = BeautifulSoup(text, 'html.parser') | ||||
|     text = html.get_text() | ||||
|     text = re.sub('\xa0+', ' ', text) | ||||
|     text = re.sub('  +', ' ', text) | ||||
|     text = re.sub(' +\n', '\n', text) | ||||
|     text = re.sub('\n\n\n+', '\n\n', text, flags=re.M) | ||||
|     return text.strip() | ||||
|  | ||||
| def find_urls(html): | ||||
|     if not html: | ||||
|         return | ||||
|     urls = [] | ||||
|     soup = BeautifulSoup(html, 'html.parser') | ||||
|     for tag in soup.find_all(["a", "img"]): | ||||
|         if tag.name == "a": | ||||
|             url = tag.get("href") | ||||
|         elif tag.name == "img": | ||||
|             url = tag.get("src") | ||||
|         if url and url not in urls: | ||||
|             urls.append(url) | ||||
|     return urls | ||||
|  | ||||
| def yes_no(question): | ||||
|     res = input(question + ' [y/n] ') | ||||
|     return res.lower() in "y1" | ||||
|  | ||||
| def save_config(config, config_file): | ||||
|     copy = dict(config) | ||||
|     with open(config_file, 'w') as fh: | ||||
|         fh.write(yaml.dump(copy, default_flow_style=False)) | ||||
|  | ||||
| def read_config(config_file): | ||||
|     config = { | ||||
|         'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc) | ||||
|     } | ||||
|     with open(config_file) as fh: | ||||
|         cfg = yaml.load(fh, yaml.SafeLoader) | ||||
|         if 'updated' in cfg: | ||||
|             cfg['updated'] = dateutil.parser.parse(cfg['updated']) | ||||
|     config.update(cfg) | ||||
|     return config | ||||
|  | ||||
| def setup(config_file): | ||||
|  | ||||
|     def yes_no(question): | ||||
|         res = input(question + ' [y/n] ') | ||||
|         return res.lower() in "y1" | ||||
|  | ||||
|     url = input('What is your Mastodon Instance URL? ') | ||||
|     have_app = yes_no('Do you have your app credentials already?') | ||||
|     if have_app: | ||||
| @ -252,14 +158,12 @@ def setup(config_file): | ||||
|  | ||||
|     feed_url = input('RSS/Atom feed URL to watch: ') | ||||
|     old_posts = yes_no('Shall already existing entries be tooted, too?') | ||||
|     include_images = yes_no('Shall images be included in the toot?') | ||||
|     config = { | ||||
|         'name': name, | ||||
|         'url': url, | ||||
|         'client_id': client_id, | ||||
|         'client_secret': client_secret, | ||||
|         'access_token': access_token, | ||||
|         'include_images': include_images, | ||||
|         'feeds': [ | ||||
|             {'url': feed_url, 'template': '{title} {url}'} | ||||
|         ] | ||||
|  | ||||
							
								
								
									
										5
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										5
									
								
								setup.py
									
									
									
									
									
								
							| @ -5,7 +5,7 @@ with open("README.md") as f: | ||||
|  | ||||
| setup( | ||||
|     name='feediverse', | ||||
|     version='0.2.0', | ||||
|     version='0.3.0', | ||||
|     python_requires='>=3.3', | ||||
|     url='https://github.com/edsu/feediverse', | ||||
|     author='Ed Summers', | ||||
| @ -18,7 +18,6 @@ setup( | ||||
|                       'feedparser', | ||||
|                       'mastodon.py', | ||||
|                       'python-dateutil', | ||||
|                       'pyyaml', | ||||
|                       'urllib3[secure]'], | ||||
|                       'pyyaml'], | ||||
|     entry_points={'console_scripts': ['feediverse = feediverse:main']} | ||||
| ) | ||||
|  | ||||
		Reference in New Issue
	
	Block a user
	