merged and resolved a few conflicts #8
This commit is contained in:
		
							
								
								
									
										25
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										25
									
								
								README.md
									
									
									
									
									
								
							| @ -18,6 +18,9 @@ Once *feediverse* is configured you can add it to your crontab: | ||||
|  | ||||
|     */15 * * * * /usr/local/bin/feediverse     | ||||
|  | ||||
| Run `feediverse --help` to show the command line options. | ||||
|  | ||||
|  | ||||
| ## Post Format | ||||
|  | ||||
| You can customize the post format by opening the configuration file (default is | ||||
| @ -34,6 +37,11 @@ like so: | ||||
| `{hashtags}` will look for tags in the feed entry and turn them into a space | ||||
| separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{link}` instead of `{url}`. | ||||
|  | ||||
| `{content}` is the whole content of the feed entry (with html-tags | ||||
| stripped). Please be aware that this might easily exceed Mastodon's | ||||
| limit of 512 characters. | ||||
|  | ||||
|  | ||||
| ## Multiple Feeds | ||||
|  | ||||
| Since *feeds* is a list you can add additional feeds to watch if you want. | ||||
| @ -44,6 +52,23 @@ Since *feeds* is a list you can add additional feeds to watch if you want. | ||||
|         template: "dot com: {title} {url}" | ||||
|       - url: https://example.org/feed/ | ||||
|         template: "dot org: {title} {url}" | ||||
|         generator: wordpress | ||||
|  | ||||
|  | ||||
| ## Special Handling for Different Feed Generators | ||||
|  | ||||
| *feediverse* has support for some special cases of some feed | ||||
| generators, for example detecting an entry's permalink. Currently | ||||
| only WordPress is handled, but others may follow. | ||||
|  | ||||
| If a feed does not provide a proper *generator* entry, you can set it | ||||
| by adding a `generator:` value to the feed's configuration. See the | ||||
| second feed in the example above. | ||||
|  | ||||
| You can check whether a feed provides a *generator* entry like this: | ||||
|  | ||||
|   feediverse --verbose --dry-run feedverse-test.rc | grep generator | ||||
|  | ||||
|  | ||||
| ## Why? | ||||
|  | ||||
|  | ||||
							
								
								
									
										194
									
								
								feediverse.py
									
									
									
									
									
								
							
							
						
						
									
										194
									
								
								feediverse.py
									
									
									
									
									
								
							| @ -2,18 +2,41 @@ | ||||
|  | ||||
| import os | ||||
| import sys | ||||
| import codecs | ||||
| import argparse | ||||
| import yaml | ||||
| import dateutil | ||||
| import feedparser | ||||
| from bs4 import BeautifulSoup | ||||
|  | ||||
| from mastodon import Mastodon | ||||
| from datetime import datetime, timezone | ||||
| from datetime import datetime, timezone, MINYEAR | ||||
| import urllib3 | ||||
| import re | ||||
|  | ||||
|  | ||||
| DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") | ||||
| MAX_IMAGES = 4  # Mastodon allows attaching 4 images max. | ||||
|  | ||||
| http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',) | ||||
|  | ||||
| # encoding error-handler for buggy wordpress urls | ||||
def __urlencodereplace_errors(exc):
    """Codec error handler that percent-escapes characters which failed to encode.

    The offending slice of the input is encoded as UTF-8 and every resulting
    byte is emitted as ``%`` followed by its upper-case hexadecimal value.

    :param exc: the ``UnicodeEncodeError`` describing the failed slice
    :returns: tuple of (replacement bytes, position at which to resume)
    """
    offending = exc.object[exc.start:exc.end]
    escaped = bytearray()
    for byte in offending.encode("utf-8"):
        escaped += b'%%%X' % byte
    return bytes(escaped), exc.end


# Make the handler available as ``errors="urlencodereplace"``.
codecs.register_error("urlencodereplace", __urlencodereplace_errors)
|  | ||||
|  | ||||
| DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") | ||||
|  | ||||
| def main(): | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument("-n", "--dry-run", action="store_true", | ||||
|                         help=("perform a trial run with no changes made: " | ||||
|                               "don't toot, don't save config")) | ||||
|     parser.add_argument("-v", "--verbose", action="store_true", | ||||
|                         help="be verbose") | ||||
|     parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE", | ||||
|                         help=("config file to use, default: %s" % | ||||
|                               DEFAULT_CONFIG_FILE), | ||||
| @ -33,57 +56,175 @@ def main(): | ||||
|         access_token=config['access_token'] | ||||
|     ) | ||||
|  | ||||
|     newest_post = config['updated'] | ||||
|     for feed in config['feeds']: | ||||
|         for entry in get_feed(feed['url'], config['updated']): | ||||
|             masto.status_post(feed['template'].format(**entry)[0:49999999999]) | ||||
|         for entry in get_feed(feed['url'], config['updated'], | ||||
|                               config['include_images'], | ||||
|                               generator=feed.get('generator')): | ||||
|             newest_post = max(newest_post, entry['updated']) | ||||
|             if args.verbose: | ||||
|                 try: | ||||
|                     print(entry) | ||||
|                 except UnicodeEncodeError: | ||||
|                     # work-around for non-unicode terminals | ||||
|                     print(dict( | ||||
|                         (k, v.encode("utf-8") if hasattr(v, "encode") else v) | ||||
|                         for k, v in entry.items())) | ||||
|             if args.dry_run: | ||||
|                 print("trial run, not tooting ", entry["title"][:50]) | ||||
|                 continue | ||||
|             media_ids = [] | ||||
|             for img in entry.get("images", []): | ||||
|                 media = masto.media_post(img, img.headers['content-type']) | ||||
|                 img.release_conn()  # deferred from collect_images() | ||||
|                 if not 'error' in media: | ||||
|                     media_ids.append(media) | ||||
|             entry.pop("images", None) | ||||
|             masto.status_post(feed['template'].format(**entry)[:49999999999], | ||||
|                               media_ids=media_ids) | ||||
|  | ||||
|     config['updated'] = newest_post.isoformat() | ||||
|     if args.dry_run: | ||||
|         print("trial run, not saving the config") | ||||
|     else: | ||||
|         if args.verbose: | ||||
|             print("saving the config") | ||||
|         save_config(config, config_file) | ||||
|  | ||||
def save_config(config, config_file):
    """Write *config* to *config_file* as block-style YAML.

    A shallow copy is dumped, so the caller's dict is left untouched.

    The ``updated`` value is written exactly as supplied instead of being
    replaced by the current time: ``main()`` stores the timestamp of the
    newest entry it processed, and ``setup()`` deliberately omits the key
    when already existing entries shall be tooted. Stamping "now" here would
    silently discard both decisions.

    :param config: mapping holding the configuration values
    :param config_file: path of the file to create or overwrite
    """
    copy = dict(config)
    updated = copy.get('updated')
    if isinstance(updated, datetime):
        # read_config() parses this value from a string, so store ISO text
        # rather than letting YAML serialise a native timestamp.
        copy['updated'] = updated.isoformat()
    with open(config_file, 'w') as fh:
        fh.write(yaml.dump(copy, default_flow_style=False))
|  | ||||
def read_config(config_file):
    """Load the YAML configuration and merge it over the built-in defaults.

    Defaults: ``updated`` is the earliest representable UTC datetime (so
    every feed entry counts as new) and ``include_images`` is ``False``.

    :param config_file: path of the YAML file to read
    :returns: dict of settings; ``updated`` is a ``datetime`` whenever the
        file provided a usable value or the default applied
    """
    # A bare ``import dateutil`` does not load the ``parser`` submodule.
    import dateutil.parser

    config = {
        'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc),
        'include_images': False,
    }
    with open(config_file) as fh:
        # SafeLoader never constructs arbitrary Python objects.
        # An empty file loads as None, hence the fallback to {}.
        cfg = yaml.load(fh, yaml.SafeLoader) or {}

    updated = cfg.get('updated')
    if updated is None:
        # Missing or ``updated: null`` -> keep the default.
        cfg.pop('updated', None)
    elif not isinstance(updated, datetime):
        # Normally an ISO string; str() also covers a YAML date-only value.
        cfg['updated'] = dateutil.parser.parse(str(updated))
    # else: YAML already produced a datetime (unquoted timestamp); keep it.

    config.update(cfg)
    return config
|  | ||||
| def get_feed(feed_url, last_update): | ||||
def detect_generator(feed):
    """Return ``"wordpress"`` if the feed declares WordPress as its generator.

    RSS feeds carry the generator's URL in the generator value, while Atom
    feeds carry its name, so both forms are checked.

    :param feed: parsed feed whose ``feed`` mapping may hold ``generator``
    :returns: ``"wordpress"``, or ``None`` if the generator is not recognised
    """
    generator = feed.feed.get("generator", "")
    is_wordpress = ("/wordpress.org/" in generator
                    or generator.lower() == "wordpress")
    return "wordpress" if is_wordpress else None
|  | ||||
def get_feed(feed_url, last_update, include_images, generator=None):
    """Yield the feed's entries newer than *last_update*, oldest first.

    :param feed_url: location of the RSS/Atom feed, handed to feedparser
    :param last_update: only entries whose ``updated`` value parses to a
        later datetime are yielded; a falsy value yields every entry
    :param include_images: forwarded to ``get_entry()``
    :param generator: feed generator name; when falsy it is detected from
        the feed via ``detect_generator()``
    :returns: generator of entry dicts as built by ``get_entry()``; its
        return value (``StopIteration.value``) is the number yielded
    """
    # A bare ``import dateutil`` does not load the ``parser`` submodule.
    import dateutil.parser

    feed = feedparser.parse(feed_url)
    if last_update:
        entries = [e for e in feed.entries
                   if dateutil.parser.parse(e['updated']) > last_update]
    else:
        entries = list(feed.entries)

    def sort_key(e):
        # Entries may carry only an updated date; fall back to it, and to an
        # empty tuple (sorts first) when no parsed date exists at all.
        return e.get('published_parsed') or e.get('updated_parsed') or ()

    entries.sort(key=sort_key)
    generator = generator or detect_generator(feed)
    new_entries = 0
    for entry in entries:
        new_entries += 1
        yield get_entry(entry, include_images, generator)
    return new_entries
|  | ||||
| def get_entry(entry): | ||||
def collect_images(entry, generator=None):
    """Fetch up to ``MAX_IMAGES`` image/video resources referenced by *entry*.

    Candidate URLs come from ``<a href>`` and ``<img src>`` in the entry's
    summary and content parts, and from enclosures whose declared type is
    ``image/*`` or ``video/*``. Each candidate is requested, and kept only if
    the response's content-type is ``image/*`` or ``video/*``.

    :param entry: feed entry (mapping with optional ``summary``, ``content``,
        ``enclosures`` and ``links`` keys)
    :param generator: ``"wordpress"`` enables WordPress-specific filtering
        and URL escaping
    :returns: list of open HTTP responses; the caller must call
        ``release_conn()`` on each one when done
    """
    urls = []

    def add_url(url):
        # Skip missing attributes and keep first-seen order without dupes.
        if url and url not in urls:
            urls.append(url)

    def find_urls(part):
        if not part:
            return
        soup = BeautifulSoup(part, 'html.parser')
        for tag in soup.find_all(["a", "img"]):
            # .get(): anchors without href / images without src are common.
            add_url(tag.get("href" if tag.name == "a" else "src"))

    find_urls(entry.get("summary", ""))
    for c in entry.get("content", []):
        find_urls(c.get("value", ""))

    enclosures = (list(entry.get("enclosures", []))
                  + [l for l in entry.get("links", [])
                     if l.get("rel") == "enclosure"])
    for e in enclosures:
        # An enclosure may lack a type; treat that as "not an image".
        if (e.get("type") or "").startswith(("image/", "video/")):
            add_url(e.get("href"))

    if generator == "wordpress":
        urls = [u for u in urls if "/wp-content/plugins/" not in u]
        # Work around a wordpress bug: If the filename contains an
        # umlaut, this will not be encoded using %-escape, as the
        # standard demands. This will break encoding in http.request()
        urls = [u.encode("ascii", "urlencodereplace").decode()
                for u in urls]

    images = []
    for url in urls:
        # mailto:, #anchors and relative links cannot be fetched.
        if not url.lower().startswith(("http://", "https://")):
            continue
        resp = http.request('GET', url, preload_content=False)
        content_type = resp.headers.get('content-type', '')
        if content_type.startswith(("image/", "video/")):
            images.append(resp)
            # IMPORTANT: Need to release_conn() later!
            if len(images) >= MAX_IMAGES:
                break
        else:
            resp.release_conn()
    return images
|  | ||||
|  | ||||
def get_entry(entry, include_images, generator=None):
    """Convert a parsed feed entry into the dict used to format a toot.

    :param entry: feed entry as produced by feedparser
    :param include_images: when true, referenced images are fetched via
        ``collect_images()``; otherwise ``images`` is an empty list
    :param generator: feed generator name; ``"wordpress"`` selects the
        entry's HTML alternate link as ``url``
    :returns: dict with the keys ``url``, ``link``, ``title``, ``summary``,
        ``content``, ``hashtags``, ``updated`` (datetime), ``images`` and
        ``__generator__``
    """
    # A bare ``import dateutil`` does not load the ``parser`` submodule.
    import dateutil.parser

    def cleanup(text):
        """Strip HTML markup and normalise whitespace."""
        html = BeautifulSoup(text, 'html.parser')
        # Remove all elements of class read-more or read-more-*
        for more in html.find_all(None, re.compile("^read-more($|-.*)")):
            more.extract()
        text = html.get_text()
        text = re.sub('\xa0+', ' ', text)
        text = re.sub('  +', ' ', text)
        text = re.sub(' +\n', '\n', text)
        text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
        return text.strip()

    # A tag may carry only a scheme/label, leaving its term as None.
    hashtags = ['#' + t
                for tag in entry.get('tags', [])
                for t in (tag.get('term') or '').split()]

    summary = entry.get('summary', '')
    content = entry.get('content', '') or ''
    if content:
        content = cleanup(content[0].get('value', ''))

    # Entries without an id fall back to their link.
    url = entry.get('id') or entry.get('link', '')
    if generator == "wordpress":
        all_links = entry.get("links", [])
        links = [l for l in all_links if l.get("rel") == "alternate"]
        if len(links) > 1:
            links = [l for l in all_links if l.get("type") == "text/html"]
        if links:
            url = links[0]["href"]

    return {
        'url': url,
        'link': entry.link,
        'title': cleanup(entry.title),
        'summary': cleanup(summary),
        'content': content,
        'hashtags': ' '.join(hashtags),
        'updated': dateutil.parser.parse(entry['updated']),
        'images': collect_images(entry, generator) if include_images else [],
        '__generator__': generator,
    }
|  | ||||
| def setup(config_file): | ||||
|  | ||||
|     def yes_no(question): | ||||
|         res = input(question + ' [y/n] ') | ||||
|         return res.lower() in "y1" | ||||
|  | ||||
|     url = input('What is your Mastodon Instance URL? ') | ||||
|     have_app = input('Do you have your app credentials already? [y/n] ') | ||||
|     if have_app.lower() == 'y': | ||||
|     have_app = yes_no('Do you have your app credentials already?') | ||||
|     if have_app: | ||||
|         name = 'feediverse' | ||||
|         client_id = input('What is your app\'s client id: ') | ||||
|         client_secret = input('What is your client secret: ') | ||||
| @ -103,16 +244,21 @@ def setup(config_file): | ||||
|         access_token = m.log_in(username, password) | ||||
|  | ||||
|     feed_url = input('RSS/Atom feed URL to watch: ') | ||||
|     old_posts = yes_no('Shall already existing entries be tooted, too?') | ||||
|     include_images = yes_no('Shall images be included in the toot?') | ||||
|     config = { | ||||
|         'name': name, | ||||
|         'url': url, | ||||
|         'client_id': client_id, | ||||
|         'client_secret': client_secret, | ||||
|         'access_token': access_token, | ||||
|         'include_images': include_images, | ||||
|         'feeds': [ | ||||
|             {'url': feed_url, 'template': '{title} {url}'} | ||||
|         ] | ||||
|     } | ||||
|     if not old_posts: | ||||
|         config['updated'] = datetime.now(tz=timezone.utc).isoformat() | ||||
|     save_config(config, config_file) | ||||
|     print("") | ||||
|     print("Your feediverse configuration has been saved to {}".format(config_file)) | ||||
|  | ||||
							
								
								
									
										7
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										7
									
								
								setup.py
									
									
									
									
									
								
							| @ -14,6 +14,11 @@ setup( | ||||
|     description='Connect an RSS Feed to Mastodon', | ||||
|     long_description=long_description, | ||||
|     long_description_content_type="text/markdown", | ||||
|     install_requires=['feedparser', 'mastodon.py', 'python-dateutil', 'pyyaml'], | ||||
|     install_requires=['beautifulsoup4', | ||||
|                       'feedparser', | ||||
|                       'mastodon.py', | ||||
|                       'python-dateutil', | ||||
|                       'pyyaml', | ||||
|                       'urllib3[secure]'], | ||||
|     entry_points={'console_scripts': ['feediverse = feediverse:main']} | ||||
| ) | ||||
|  | ||||
		Reference in New Issue
	
	Block a user