merged and resolved a few conflicts #8

2020-09-29 10:51:29 -04:00
parent b5ec046f87 2748ac0da6
commit 46b46ca875
3 changed files with 203 additions and 27 deletions
--- a/README.md
+++ b/README.md
@ -18,6 +18,9 @@ Once *feediverse* is configured you can add it to your crontab:

    */15 * * * * /usr/local/bin/feediverse    

+Run `feediverse --help` to show the command-line options.
+
+
 ## Post Format

 You can customize the post format by opening the configuration file (default is
@ -34,6 +37,11 @@ like so:
 `{hashtags}` will look for tags in the feed entry and turn them into a space
 separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{link}` instead of `{url}`.

+`{content}` is the whole content of the feed entry (with HTML tags
+stripped). Please be aware that this might easily exceed Mastodon's
+default limit of 500 characters.
+
+
 ## Multiple Feeds

 Since *feeds* is a list you can add additional feeds to watch if you want.
@ -44,6 +52,23 @@ Since *feeds* is a list you can add additional feeds to watch if you want.
        template: "dot com: {title} {url}"
      - url: https://example.org/feed/
        template: "dot org: {title} {url}"
+        generator: wordpress
+
+
+## Special Handling for Different Feed Generators
+
+*feediverse* has special handling for some feed generators, for
+example detecting an entry's permalink. Currently only WordPress is
+handled, but others may follow.
+
+If a feed does not provide a proper *generator* entry, you can set it
+by adding a `generator:` value to the feed's configuration. See the
+second feed in the example above.
+
+You can check whether a feed provides a *generator* entry like this:
+
+    feediverse --verbose --dry-run feediverse-test.rc | grep generator
+

 ## Why?

--- a/feediverse.py
+++ b/feediverse.py
@ -2,18 +2,41 @@

 import os
 import sys
+import codecs
 import argparse
 import yaml
 import dateutil
 import feedparser
+from bs4 import BeautifulSoup

 from mastodon import Mastodon
-from datetime import datetime, timezone
+from datetime import datetime, timezone, MINYEAR
+import urllib3
+import re
+
+
+DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
+MAX_IMAGES = 4  # Mastodon allows attaching 4 images max.
+
+http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',)
+
+# encoding error-handler for buggy wordpress urls
+def __urlencodereplace_errors(exc):
+    bs = exc.object[exc.start:exc.end].encode("utf-8")
+    bs = b"".join(b'%%%X' % b for b in bs)
+    return (bs, exc.end)
+codecs.register_error("urlencodereplace", __urlencodereplace_errors)
+

 DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")

 def main():
    parser = argparse.ArgumentParser()
+    parser.add_argument("-n", "--dry-run", action="store_true",
+                        help=("perform a trial run with no changes made: "
+                              "don't toot, don't save config"))
+    parser.add_argument("-v", "--verbose", action="store_true",
+                        help="be verbose")
    parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE",
                        help=("config file to use, default: %s" %
                              DEFAULT_CONFIG_FILE),
@ -33,57 +56,175 @@ def main():
        access_token=config['access_token']
    )

+    newest_post = config['updated']
    for feed in config['feeds']:
-        for entry in get_feed(feed['url'], config['updated']):
-            masto.status_post(feed['template'].format(**entry)[0:49999999999])
+        for entry in get_feed(feed['url'], config['updated'],
+                              config['include_images'],
+                              generator=feed.get('generator')):
+            newest_post = max(newest_post, entry['updated'])
+            if args.verbose:
+                try:
+                    print(entry)
+                except UnicodeEncodeError:
+                    # work-around for non-unicode terminals
+                    print(dict(
+                        (k, v.encode("utf-8") if hasattr(v, "encode") else v)
+                        for k, v in entry.items()))
+            if args.dry_run:
+                print("trial run, not tooting ", entry["title"][:50])
+                continue
+            media_ids = []
+            for img in entry.get("images", []):
+                media = masto.media_post(img, img.headers['content-type'])
+                img.release_conn()  # deferred from collect_images()
+                if not 'error' in media:
+                    media_ids.append(media)
+            entry.pop("images", None)
+            masto.status_post(feed['template'].format(**entry)[:49999999999],
+                              media_ids=media_ids)

-    save_config(config, config_file)
+    config['updated'] = newest_post.isoformat()
+    if args.dry_run:
+        print("trial run, not saving the config")
+    else:
+        if args.verbose:
+            print("saving the config")
+        save_config(config, config_file)

 def save_config(config, config_file):
    copy = dict(config)
-    copy['updated'] = datetime.now(tz=timezone.utc).isoformat()
    with open(config_file, 'w') as fh:
        fh.write(yaml.dump(copy, default_flow_style=False))

 def read_config(config_file):
-    config = {}
+    config = {
+        'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc),
+        'include_images': False,
+    }
    with open(config_file) as fh:
-        config = yaml.load(fh)
-        if 'updated' in config:
-            config['updated'] = dateutil.parser.parse(config['updated'])
-        else:
-            config['updated'] = datetime.now(tz=timezone.utc)
+        cfg = yaml.load(fh, yaml.SafeLoader)
+        if 'updated' in cfg:
+            cfg['updated'] = dateutil.parser.parse(cfg['updated'])
+    config.update(cfg)
    return config

-def get_feed(feed_url, last_update):
+def detect_generator(feed):
+    # For RSS the generator tag holds the URL, while for ATOM it holds the name
+    generator = feed.feed.get("generator", "")
+    if "/wordpress.org/" in generator:
+        return "wordpress"
+    elif "wordpress" == generator.lower():
+        return "wordpress"
+    return None
+
+def get_feed(feed_url, last_update, include_images, generator=None):
    new_entries = 0
    feed = feedparser.parse(feed_url)
-    feed.entries.sort(key=lambda e: e.published_parsed)
-    for entry in feed.entries:
-        e = get_entry(entry)
-        if last_update is None or e['updated'] > last_update:
-            new_entries += 1
-            yield e
+    if last_update:
+        entries = [e for e in feed.entries
+                   if dateutil.parser.parse(e['updated']) > last_update]
+    else:
+        entries = feed.entries
+    entries.sort(key=lambda e: e.published_parsed)
+    generator = generator or detect_generator(feed)
+    for entry in entries:
+        new_entries += 1
+        yield get_entry(entry, include_images, generator)
    return new_entries

-def get_entry(entry):
+def collect_images(entry, generator=None):
+
+    def find_urls(part):
+        if not part:
+            return
+        soup = BeautifulSoup(part, 'html.parser')
+        for tag in soup.find_all(["a", "img"]):
+            if tag.name == "a":
+                url = tag["href"]
+            elif tag.name == "img":
+                url = tag["src"]
+            if url not in urls:
+                urls.append(url)
+
+    urls = []
+    find_urls(entry.get("summary", ""))
+    for c in entry.get("content", []):
+        find_urls(c.value)
+    for e in (entry.enclosures
+              + [l for l in entry.links if l.get("rel") == "enclosure"]):
+        if (e["type"].startswith(("image/", "video/")) and
+            e["href"] not in urls):
+            urls.append(e["href"])
+    if generator == "wordpress":
+        urls = (u for u in urls if not "/wp-content/plugins/" in u)
+        # Work around a wordpress bug: If the filename contains an
+        # umlaut, this will not be encoded using %-escape, as the
+        # standard demands. This will break encoding in http.request()
+        urls = (u.encode("ascii", "urlencodereplace").decode()
+                for u in urls)
+    images = []
+    for url in urls:
+        resp = http.request('GET', url, preload_content=False)
+        if resp.headers['content-type'].startswith(("image/", "video/")):
+            images.append(resp)
+            # IMPORTANT: Need to release_conn() later!
+            if len(images) >= MAX_IMAGES:
+                break
+        else:
+            resp.release_conn()
+    return images
+
+
+def get_entry(entry, include_images, generator=None):
+
+    def cleanup(text):
+        html = BeautifulSoup(text, 'html.parser')
+        # Remove all elements of class read-more or read-more-*
+        for more in html.find_all(None, re.compile("^read-more($|-.*)")):
+            more.extract()
+        text = html.get_text()
+        text = re.sub('\xa0+', ' ', text)
+        text = re.sub('  +', ' ', text)
+        text = re.sub(' +\n', '\n', text)
+        text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
+        return text.strip()
+
    hashtags = []
    for tag in entry.get('tags', []):
-        for t in tag['term'].split(' '):
-            hashtags.append('#{}'.format(t))
+        for t in tag['term'].split():
+            hashtags.append('#' + t)
+    summary = entry.get('summary', '')
+    content = entry.get('content', '') or ''
+    if content:
+        content = cleanup(content[0].get('value', ''))
+    url = entry.id
+    if generator == "wordpress":
+        links = [l for l in entry.links if l.get("rel") == "alternate"]
+        if len(links) > 1:
+            links = [l for l in entry.links if l.get("type") == "text/html"]
+        if links:
+            url = links[0]["href"]
    return {
-        'url': entry.id,
+        'url': url,
        'link': entry.link,
-        'title': entry.title,
-        'summary': entry.get('summary', ''),
+        'title': cleanup(entry.title),
+        'summary': cleanup(summary),
+        'content': content,
        'hashtags': ' '.join(hashtags),
        'updated': dateutil.parser.parse(entry['updated']),
+        'images': collect_images(entry, generator) if include_images else [],
+        '__generator__': generator,
    }

 def setup(config_file):
+
+    def yes_no(question):
+        res = input(question + ' [y/n] ')
+        return res.lower() in "y1"
+
    url = input('What is your Mastodon Instance URL? ')
-    have_app = input('Do you have your app credentials already? [y/n] ')
-    if have_app.lower() == 'y':
+    have_app = yes_no('Do you have your app credentials already?')
+    if have_app:
        name = 'feediverse'
        client_id = input('What is your app\'s client id: ')
        client_secret = input('What is your client secret: ')
@ -103,16 +244,21 @@ def setup(config_file):
        access_token = m.log_in(username, password)

    feed_url = input('RSS/Atom feed URL to watch: ')
+    old_posts = yes_no('Shall already existing entries be tooted, too?')
+    include_images = yes_no('Shall images be included in the toot?')
    config = {
        'name': name,
        'url': url,
        'client_id': client_id,
        'client_secret': client_secret,
        'access_token': access_token,
+        'include_images': include_images,
        'feeds': [
            {'url': feed_url, 'template': '{title} {url}'}
        ]
    }
+    if not old_posts:
+        config['updated'] = datetime.now(tz=timezone.utc).isoformat()
    save_config(config, config_file)
    print("")
    print("Your feediverse configuration has been saved to {}".format(config_file))
--- a/setup.py
+++ b/setup.py
@ -14,6 +14,11 @@ setup(
    description='Connect an RSS Feed to Mastodon',
    long_description=long_description,
    long_description_content_type="text/markdown",
-    install_requires=['feedparser', 'mastodon.py', 'python-dateutil', 'pyyaml'],
+    install_requires=['beautifulsoup4',
+                      'feedparser',
+                      'mastodon.py',
+                      'python-dateutil',
+                      'pyyaml',
+                      'urllib3[secure]'],
    entry_points={'console_scripts': ['feediverse = feediverse:main']}
 )