merged and resolved a few conflicts #8

This commit is contained in:
Ed Summers
2020-09-29 10:51:29 -04:00
3 changed files with 203 additions and 27 deletions

View File

@ -18,6 +18,9 @@ Once *feediverse* is configured you can add it to your crontab:
*/15 * * * * /usr/local/bin/feediverse */15 * * * * /usr/local/bin/feediverse
Run `feediverse --help` to show the command-line options.
## Post Format ## Post Format
You can customize the post format by opening the configuration file (default is You can customize the post format by opening the configuration file (default is
@ -34,6 +37,11 @@ like so:
`{hashtags}` will look for tags in the feed entry and turn them into a space `{hashtags}` will look for tags in the feed entry and turn them into a space
separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{link}` instead of `{url}`. separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{link}` instead of `{url}`.
`{content}` is the whole content of the feed entry (with HTML tags
stripped). Please be aware that this can easily exceed Mastodon's
default limit of 500 characters per post.
## Multiple Feeds ## Multiple Feeds
Since *feeds* is a list you can add additional feeds to watch if you want. Since *feeds* is a list you can add additional feeds to watch if you want.
@ -44,6 +52,23 @@ Since *feeds* is a list you can add additional feeds to watch if you want.
template: "dot com: {title} {url}" template: "dot com: {title} {url}"
- url: https://example.org/feed/ - url: https://example.org/feed/
template: "dot org: {title} {url}" template: "dot org: {title} {url}"
generator: wordpress
## Special Handling for Different Feed Generators
*feediverse* has special handling for the output of some feed
generators, for example to detect an entry's permalink. Currently
only WordPress is handled, but others may follow.
If a feed does not provide a proper *generator* entry, you can set it
by adding a `generator:` value to the feed's configuration. See the
second feed in the example above.
You can check whether a feed provides a *generator* entry like this:
feediverse --verbose --dry-run feedverse-test.rc | grep generator
## Why? ## Why?

View File

@ -2,18 +2,41 @@
import os import os
import sys import sys
import codecs
import argparse import argparse
import yaml import yaml
import dateutil import dateutil
import feedparser import feedparser
from bs4 import BeautifulSoup
from mastodon import Mastodon from mastodon import Mastodon
from datetime import datetime, timezone from datetime import datetime, timezone, MINYEAR
import urllib3
import re
DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
MAX_IMAGES = 4 # Mastodon allows attaching 4 images max.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',)
# encoding error-handler for buggy wordpress urls
def __urlencodereplace_errors(exc):
    """Codec error handler that percent-encodes text ASCII cannot represent.

    Meant to be registered via ``codecs.register_error`` and then passed as
    the ``errors`` argument of ``str.encode("ascii", ...)``, so that a URL
    containing raw non-ASCII characters (e.g. an umlaut in a file name) is
    converted to its %-escaped form instead of raising.

    Args:
        exc: The ``UnicodeEncodeError`` raised by the encoder;
            ``exc.object[exc.start:exc.end]`` is the unencodable text.

    Returns:
        A ``(replacement, resume_position)`` tuple: the UTF-8 bytes of the
        offending text, each written as ``%XX``, and the index at which
        the encoder should continue.

    Raises:
        TypeError: If called for anything other than an encoding error.
    """
    if not isinstance(exc, UnicodeEncodeError):
        # For decode/translate errors ``exc.object`` is not a str we can
        # re-encode; refuse explicitly instead of failing obscurely.
        raise TypeError("don't know how to handle %r" % (exc,))
    utf8 = exc.object[exc.start:exc.end].encode("utf-8")
    # Percent-encoding needs exactly two hex digits per byte, hence %02X.
    escaped = b"".join(b"%%%02X" % byte for byte in utf8)
    return (escaped, exc.end)
codecs.register_error("urlencodereplace", __urlencodereplace_errors)
DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
def main(): def main():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("-n", "--dry-run", action="store_true",
help=("perform a trial run with no changes made: "
"don't toot, don't save config"))
parser.add_argument("-v", "--verbose", action="store_true",
help="be verbose")
parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE", parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE",
help=("config file to use, default: %s" % help=("config file to use, default: %s" %
DEFAULT_CONFIG_FILE), DEFAULT_CONFIG_FILE),
@ -33,57 +56,175 @@ def main():
access_token=config['access_token'] access_token=config['access_token']
) )
newest_post = config['updated']
for feed in config['feeds']: for feed in config['feeds']:
for entry in get_feed(feed['url'], config['updated']): for entry in get_feed(feed['url'], config['updated'],
masto.status_post(feed['template'].format(**entry)[0:49999999999]) config['include_images'],
generator=feed.get('generator')):
newest_post = max(newest_post, entry['updated'])
if args.verbose:
try:
print(entry)
except UnicodeEncodeError:
# work-around for non-unicode terminals
print(dict(
(k, v.encode("utf-8") if hasattr(v, "encode") else v)
for k, v in entry.items()))
if args.dry_run:
print("trial run, not tooting ", entry["title"][:50])
continue
media_ids = []
for img in entry.get("images", []):
media = masto.media_post(img, img.headers['content-type'])
img.release_conn() # deferred from collect_images()
if not 'error' in media:
media_ids.append(media)
entry.pop("images", None)
masto.status_post(feed['template'].format(**entry)[:49999999999],
media_ids=media_ids)
config['updated'] = newest_post.isoformat()
if args.dry_run:
print("trial run, not saving the config")
else:
if args.verbose:
print("saving the config")
save_config(config, config_file) save_config(config, config_file)
def save_config(config, config_file): def save_config(config, config_file):
copy = dict(config) copy = dict(config)
copy['updated'] = datetime.now(tz=timezone.utc).isoformat()
with open(config_file, 'w') as fh: with open(config_file, 'w') as fh:
fh.write(yaml.dump(copy, default_flow_style=False)) fh.write(yaml.dump(copy, default_flow_style=False))
def read_config(config_file): def read_config(config_file):
config = {} config = {
'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc),
'include_images': False,
}
with open(config_file) as fh: with open(config_file) as fh:
config = yaml.load(fh) cfg = yaml.load(fh, yaml.SafeLoader)
if 'updated' in config: if 'updated' in cfg:
config['updated'] = dateutil.parser.parse(config['updated']) cfg['updated'] = dateutil.parser.parse(cfg['updated'])
else: config.update(cfg)
config['updated'] = datetime.now(tz=timezone.utc)
return config return config
def get_feed(feed_url, last_update): def detect_generator(feed):
# For RSS the generator tag holds the URL, while for ATOM it holds the name
generator = feed.feed.get("generator", "")
if "/wordpress.org/" in generator:
return "wordpress"
elif "wordpress" == generator.lower():
return "wordpress"
return None
def get_feed(feed_url, last_update, include_images, generator=None):
new_entries = 0 new_entries = 0
feed = feedparser.parse(feed_url) feed = feedparser.parse(feed_url)
feed.entries.sort(key=lambda e: e.published_parsed) if last_update:
for entry in feed.entries: entries = [e for e in feed.entries
e = get_entry(entry) if dateutil.parser.parse(e['updated']) > last_update]
if last_update is None or e['updated'] > last_update: else:
entries = feed.entries
entries.sort(key=lambda e: e.published_parsed)
generator = generator or detect_generator(feed)
for entry in entries:
new_entries += 1 new_entries += 1
yield e yield get_entry(entry, include_images, generator)
return new_entries return new_entries
def get_entry(entry): def collect_images(entry, generator=None):
def find_urls(part):
if not part:
return
soup = BeautifulSoup(part, 'html.parser')
for tag in soup.find_all(["a", "img"]):
if tag.name == "a":
url = tag["href"]
elif tag.name == "img":
url = tag["src"]
if url not in urls:
urls.append(url)
urls = []
find_urls(entry.get("summary", ""))
for c in entry.get("content", []):
find_urls(c.value)
for e in (entry.enclosures
+ [l for l in entry.links if l.get("rel") == "enclosure"]):
if (e["type"].startswith(("image/", "video/")) and
e["href"] not in urls):
urls.append(e["href"])
if generator == "wordpress":
urls = (u for u in urls if not "/wp-content/plugins/" in u)
# Work around a wordpress bug: If the filename contains an
# umlaut, this will not be encoded using %-escape, as the
# standard demands. This will break encoding in http.request()
urls = (u.encode("ascii", "urlencodereplace").decode()
for u in urls)
images = []
for url in urls:
resp = http.request('GET', url, preload_content=False)
if resp.headers['content-type'].startswith(("image/", "video/")):
images.append(resp)
# IMPORTANT: Need to release_conn() later!
if len(images) >= MAX_IMAGES:
break
else:
resp.release_conn()
return images
def get_entry(entry, include_images, generator=None):
def cleanup(text):
html = BeautifulSoup(text, 'html.parser')
# Remove all elements of class read-more or read-more-*
for more in html.find_all(None, re.compile("^read-more($|-.*)")):
more.extract()
text = html.get_text()
text = re.sub('\xa0+', ' ', text)
text = re.sub(' +', ' ', text)
text = re.sub(' +\n', '\n', text)
text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
return text.strip()
hashtags = [] hashtags = []
for tag in entry.get('tags', []): for tag in entry.get('tags', []):
for t in tag['term'].split(' '): for t in tag['term'].split():
hashtags.append('#{}'.format(t)) hashtags.append('#' + t)
summary = entry.get('summary', '')
content = entry.get('content', '') or ''
if content:
content = cleanup(content[0].get('value', ''))
url = entry.id
if generator == "wordpress":
links = [l for l in entry.links if l.get("rel") == "alternate"]
if len(links) > 1:
links = [l for l in entry.links if l.get("type") == "text/html"]
if links:
url = links[0]["href"]
return { return {
'url': entry.id, 'url': url,
'link': entry.link, 'link': entry.link,
'title': entry.title, 'title': cleanup(entry.title),
'summary': entry.get('summary', ''), 'summary': cleanup(summary),
'content': content,
'hashtags': ' '.join(hashtags), 'hashtags': ' '.join(hashtags),
'updated': dateutil.parser.parse(entry['updated']), 'updated': dateutil.parser.parse(entry['updated']),
'images': collect_images(entry, generator) if include_images else [],
'__generator__': generator,
} }
def setup(config_file): def setup(config_file):
def yes_no(question):
res = input(question + ' [y/n] ')
return res.lower() in "y1"
url = input('What is your Mastodon Instance URL? ') url = input('What is your Mastodon Instance URL? ')
have_app = input('Do you have your app credentials already? [y/n] ') have_app = yes_no('Do you have your app credentials already?')
if have_app.lower() == 'y': if have_app:
name = 'feediverse' name = 'feediverse'
client_id = input('What is your app\'s client id: ') client_id = input('What is your app\'s client id: ')
client_secret = input('What is your client secret: ') client_secret = input('What is your client secret: ')
@ -103,16 +244,21 @@ def setup(config_file):
access_token = m.log_in(username, password) access_token = m.log_in(username, password)
feed_url = input('RSS/Atom feed URL to watch: ') feed_url = input('RSS/Atom feed URL to watch: ')
old_posts = yes_no('Shall already existing entries be tooted, too?')
include_images = yes_no('Shall images be included in the toot?')
config = { config = {
'name': name, 'name': name,
'url': url, 'url': url,
'client_id': client_id, 'client_id': client_id,
'client_secret': client_secret, 'client_secret': client_secret,
'access_token': access_token, 'access_token': access_token,
'include_images': include_images,
'feeds': [ 'feeds': [
{'url': feed_url, 'template': '{title} {url}'} {'url': feed_url, 'template': '{title} {url}'}
] ]
} }
if not old_posts:
config['updated'] = datetime.now(tz=timezone.utc).isoformat()
save_config(config, config_file) save_config(config, config_file)
print("") print("")
print("Your feediverse configuration has been saved to {}".format(config_file)) print("Your feediverse configuration has been saved to {}".format(config_file))

View File

@ -14,6 +14,11 @@ setup(
description='Connect an RSS Feed to Mastodon', description='Connect an RSS Feed to Mastodon',
long_description=long_description, long_description=long_description,
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
install_requires=['feedparser', 'mastodon.py', 'python-dateutil', 'pyyaml'], install_requires=['beautifulsoup4',
'feedparser',
'mastodon.py',
'python-dateutil',
'pyyaml',
'urllib3[secure]'],
entry_points={'console_scripts': ['feediverse = feediverse:main']} entry_points={'console_scripts': ['feediverse = feediverse:main']}
) )