Remove image downloading

The special casing of Wordpress and image downloading was not reliable for me so I have removed it, and tried to simplify the code in the process. If you still need this functionality you will want to pin v0.2.1.
2022-02-19 13:38:12 +00:00
parent f3daed0bfb
commit e9d58c95be
4 changed files with 58 additions and 189 deletions
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 The MIT License (MIT)
-Copyright (c) 2018 Ed Summers
+Copyright (c) Ed Summers
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/README.md
+++ b/README.md
@ -1,4 +1,6 @@
-*feediverse* will read RSS/Atom feeds and send the messages as Mastodon posts. Please use responsibly! *feediverse* is kind of the same thing as [feed2toot] or [rss-to-activitypub](https://github.com/dariusk/rss-to-activitypub/) but it's just one module that works with Python 3, and I was bored.
+*feediverse* will read RSS/Atom feeds and send the messages as Mastodon posts.
 It's meant to add a little bit of spice to your timeline from other places.
 Please use it responsibly.
 ## Install
@ -18,7 +20,6 @@ Once *feediverse* is configured you can add it to your crontab:
 Run `feediverse --help` to show the command line options.
 ## Post Format
 You can customize the post format by opening the configuration file (default is
@ -39,7 +40,6 @@ separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{l
 stripped). Please be aware that this might easily exceed Mastodon's
 default limit of 500 characters.
 ## Multiple Feeds
 Since *feeds* is a list you can add additional feeds to watch if you want.
@ -50,36 +50,4 @@ Since *feeds* is a list you can add additional feeds to watch if you want.
        template: "dot com: {title} {url}"
      - url: https://example.org/feed/
        template: "dot org: {title} {url}"
        generator: wordpress
 ## Special Handling for Different Feed Generators
 *feediverse* has special handling for the quirks of some feed
 generators, for example detecting an entry's permalink. Currently
 only WordPress is handled, but others may follow.
 If a feed does not provide a proper *generator* entry, you can set it
 by adding a `generator:` value to the feed's configuration. See the
 second feed in the example above.
 You can check whether a feed provides a *generator* entry like this:
    feediverse --verbose --dry-run feedverse-test.rc | grep generator
 ## Why?
 I created *feediverse* because I wanted to send my Pinboard bookmarks to
 Mastodon.  I've got an IFTTT recipe that does this for Twitter, but IFTTT
 doesn't appear to work with Mastodon yet. That being said, *feediverse* should
 work with any RSS or Atom feed (thanks to [feedparser]).
 ## Warning!
 Please use responsibly. Don't fill up Mastodon with tons of junk just because
 you can. That kind of toxic behavior is why a lot of people are trying to
 establish other forms of social media like Mastodon.
 [feed2toot]: https://gitlab.com/chaica/feed2toot/
 [feedparser]: http://feedparser.org/
--- a/feediverse.py
+++ b/feediverse.py
@ -4,9 +4,7 @@ import os
 import re
 import sys
 import yaml
 import codecs
 import argparse
 import urllib3
 import dateutil
 import feedparser
@ -14,19 +12,7 @@ from bs4 import BeautifulSoup
 from mastodon import Mastodon
 from datetime import datetime, timezone, MINYEAR
 # Default configuration path. The "~" is kept literal here (the result is
 # "~/.feediverse" on POSIX); presumably the caller expands it -- confirm.
 DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
 MAX_IMAGES = 4  # Mastodon allows attaching 4 images max.
 # Shared urllib3 connection pool; certificate verification is required.
 http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',)
 # Codec error handler that works around WordPress URLs containing raw
 # (not %-escaped) non-ASCII characters.
 def __urlencodereplace_errors(exc):
    """Percent-escape the characters an encoder could not handle.

    :param exc: the encoding error; ``exc.object[exc.start:exc.end]`` is the
        offending slice of the input string.
    :return: ``(replacement, resume_position)`` -- the slice's UTF-8 bytes,
        each written as ``%XX``, and the index at which encoding continues.
    """
    offending = exc.object[exc.start:exc.end]
    escaped = b"".join(b'%%%X' % octet for octet in offending.encode("utf-8"))
    return (escaped, exc.end)
 # Make the handler available as errors="urlencodereplace".
 codecs.register_error("urlencodereplace", __urlencodereplace_errors)
 def main():
    parser = argparse.ArgumentParser()
@ -61,67 +47,20 @@ def main():
    for feed in config['feeds']:
        if args.verbose:
            print(f"fetching {feed['url']} entries since {config['updated']}")
-        for entry in get_feed(feed['url'], config['updated'],
+        for entry in get_feed(feed['url'], config['updated']):
                              config['include_images'],
                              generator=feed.get('generator')):
            newest_post = max(newest_post, entry['updated'])
            if args.verbose:
                try:
                print(entry)
                except UnicodeEncodeError:
                    # work-around for non-unicode terminals
                    print(dict(
                        (k, v.encode("utf-8") if hasattr(v, "encode") else v)
                        for k, v in entry.items()))
            if args.dry_run:
                print("trial run, not tooting ", entry["title"][:50])
                continue
-            media_ids = []
+            masto.status_post(feed['template'].format(**entry)[:499])
            for img in entry.get("images", []):
                media = masto.media_post(img, img.headers['content-type'])
                img.release_conn()  # deferred from collect_images()
                if not 'error' in media:
                    media_ids.append(media)
            entry.pop("images", None)
            masto.status_post(feed['template'].format(**entry)[:499],
                              media_ids=media_ids)
    if not args.dry_run:
        config['updated'] = newest_post.isoformat()
    if args.dry_run:
        print("trial run, not saving the config")
    else:
        if args.verbose:
            print("saving the config", config_file)
        save_config(config, config_file)
-def save_config(config, config_file):
+def get_feed(feed_url, last_update):
    copy = dict(config)
    with open(config_file, 'w') as fh:
        fh.write(yaml.dump(copy, default_flow_style=False))
 def read_config(config_file):
    """Load the YAML configuration and merge it over the built-in defaults.

    :param config_file: path of the YAML file to read.
    :return: dict of settings.  ``updated`` defaults to the earliest
        representable UTC datetime and ``include_images`` to False; values
        from the file override both.  A string ``updated`` value is parsed
        into a datetime.
    """
    # Make sure the submodule is loaded: a bare "import dateutil" does not
    # guarantee that dateutil.parser is available on every release.
    import dateutil.parser

    config = {
        'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc),
        'include_images': False,
    }
    with open(config_file) as fh:
        # An empty file loads as None; treat that as "no settings" instead
        # of failing on the membership test below.
        cfg = yaml.load(fh, yaml.SafeLoader) or {}
    if 'updated' in cfg:
        updated = cfg['updated']
        # YAML may already have turned an unquoted timestamp into a datetime;
        # only strings need parsing.
        if isinstance(updated, str):
            cfg['updated'] = dateutil.parser.parse(updated)
    config.update(cfg)
    return config
 def detect_generator(feed):
    """Identify the software that produced a parsed feed.

    :param feed: parsed feed whose ``feed`` mapping may hold a ``generator``
        entry.
    :return: ``"wordpress"`` when the generator is recognised as WordPress,
        otherwise None.
    """
    # RSS puts the generator's URL in this field, Atom puts its name there,
    # so both spellings have to be checked.
    declared = feed.feed.get("generator", "")
    is_wordpress = "/wordpress.org/" in declared or declared.lower() == "wordpress"
    return "wordpress" if is_wordpress else None
 def get_feed(feed_url, last_update, include_images, generator=None):
    new_entries = 0
    feed = feedparser.parse(feed_url)
    if last_update:
        entries = [e for e in feed.entries
@ -129,73 +68,10 @@ def get_feed(feed_url, last_update, include_images, generator=None):
    else:
        entries = feed.entries
    entries.sort(key=lambda e: e.updated_parsed)
    generator = generator or detect_generator(feed)
    for entry in entries:
-        new_entries += 1
+        yield get_entry(entry)
        yield get_entry(entry, include_images, generator)
    return new_entries
 def collect_images(entry, generator=None):
    """Open HTTP responses for the images and videos a feed entry refers to.

    Candidate URLs are gathered, in order and without duplicates, from the
    ``<a href>`` and ``<img src>`` targets in the entry's summary and content
    and from its image/video enclosures.  Every candidate is requested and
    kept only when the response's content type is ``image/*`` or ``video/*``.
    Collection stops once MAX_IMAGES responses have been kept.

    :param entry: feed entry offering ``get()``, ``enclosures`` and ``links``
        (presumably a feedparser entry -- confirm against the caller).
    :param generator: feed generator name; ``"wordpress"`` enables the
        plugin-asset filter and the URL re-encoding workaround.
    :return: list of responses whose bodies have not been read yet.  The
        caller must call ``release_conn()`` on each of them.
    """
    def find_urls(part):
        # Append the link/image targets of one HTML fragment to ``urls``.
        if not part:
            return
        soup = BeautifulSoup(part, 'html.parser')
        for tag in soup.find_all(["a", "img"]):
            url = tag.get("href") if tag.name == "a" else tag.get("src")
            if url and url not in urls:
                urls.append(url)
    urls = []
    find_urls(entry.get("summary", ""))
    for c in entry.get("content", []):
        find_urls(c.value)
    enclosures = (entry.enclosures
                  + [l for l in entry.links if l.get("rel") == "enclosure"])
    for e in enclosures:
        # Feeds do not always declare a type or an href for an enclosure;
        # skip such entries instead of failing with a KeyError.
        media_type = e.get("type") or ""
        href = e.get("href")
        if (href and media_type.startswith(("image/", "video/"))
                and href not in urls):
            urls.append(href)
    if generator == "wordpress":
        urls = (u for u in urls if "/wp-content/plugins/" not in u)
        # Work around a wordpress bug: If the filename contains an
        # umlaut, this will not be encoded using %-escape, as the
        # standard demands. This will break encoding in http.request()
        urls = (u.encode("ascii", "urlencodereplace").decode()
                for u in urls)
    images = []
    for url in urls:
        try:
            # preload_content=False defers reading the body, so the
            # connection stays checked out until release_conn() is called.
            resp = http.request('GET', url, preload_content=False)
            if resp.headers.get('content-type', '').startswith(("image/", "video/")):
                images.append(resp)
                # IMPORTANT: Need to release_conn() later!
                if len(images) >= MAX_IMAGES:
                    break
            else:
                resp.release_conn()
        except urllib3.exceptions.HTTPError:
            # Best effort: a URL that cannot be fetched is simply left out.
            # Maybe these errors should be logged?
            pass
    return images
 def get_entry(entry, include_images, generator=None):
    def cleanup(text):
        html = BeautifulSoup(text, 'html.parser')
        # Remove all elements of class read-more or read-more-*
        for more in html.find_all(None, re.compile("^read-more($|-.*)")):
            more.extract()
        text = html.get_text()
        text = re.sub('\xa0+', ' ', text)
        text = re.sub('  +', ' ', text)
        text = re.sub(' +\n', '\n', text)
        text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
        return text.strip()
 def get_entry(entry):
    hashtags = []
    for tag in entry.get('tags', []):
        t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
@ -205,14 +81,6 @@ def get_entry(entry, include_images, generator=None):
    if content:
        content = cleanup(content[0].get('value', ''))
    url = entry.id
    if generator == "wordpress":
        links = [l for l in entry.links if l.get("rel") == "alternate"]
        if len(links) > 1:
            links = [l for l in entry.links if l.get("type") == "text/html"]
        if links:
            url = links[0]["href"]
        t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
        hashtags.append('#{}'.format(t))
    return {
        'url': url,
        'link': entry.link,
@ -220,17 +88,53 @@ def get_entry(entry, include_images, generator=None):
        'summary': cleanup(summary),
        'content': content,
        'hashtags': ' '.join(hashtags),
-        'updated': dateutil.parser.parse(entry['updated']),
+        'updated': dateutil.parser.parse(entry['updated'])
        'images': collect_images(entry, generator) if include_images else [],
        '__generator__': generator,
    }
-def setup(config_file):
+def cleanup(text):
    html = BeautifulSoup(text, 'html.parser')
    text = html.get_text()
    text = re.sub('\xa0+', ' ', text)
    text = re.sub('  +', ' ', text)
    text = re.sub(' +\n', '\n', text)
    text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
    return text.strip()
 def find_urls(html):
    """Return the distinct link and image URLs found in an HTML fragment.

    :param html: HTML markup; may be empty or None.
    :return: list of the ``href`` values of ``<a>`` tags and the ``src``
        values of ``<img>`` tags, in document order and without duplicates.
        An empty list when ``html`` is empty or None.
    """
    if not html:
        # Always hand back a list so callers can iterate unconditionally.
        return []
    urls = []
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(["a", "img"]):
        # Only <a> and <img> are requested above, so anything that is not
        # an anchor is an image.
        url = tag.get("href") if tag.name == "a" else tag.get("src")
        if url and url not in urls:
            urls.append(url)
    return urls
 def yes_no(question):
    """Ask a yes/no question on the terminal and return the answer.

    :param question: prompt text; ``' [y/n] '`` is appended to it.
    :return: True when the reply is "y", "yes" or "1" (any letter case,
        surrounding whitespace ignored); False for anything else, including
        an empty reply.
    """
    res = input(question + ' [y/n] ')
    # Compare against whole answers.  A substring test such as `in "y1"`
    # also matches the empty string, so just pressing Enter meant "yes".
    return res.strip().lower() in ("y", "yes", "1")
 def save_config(config, config_file):
    """Write the configuration to ``config_file`` as block-style YAML.

    :param config: mapping of settings to persist.
    :param config_file: path of the file to create or overwrite.
    """
    # Serialise before opening: open(..., 'w') truncates the file at once,
    # so a failure inside yaml.dump() must not wipe the existing settings.
    text = yaml.dump(dict(config), default_flow_style=False)
    with open(config_file, 'w') as fh:
        fh.write(text)
 def read_config(config_file):
    """Load the YAML configuration and merge it over the built-in defaults.

    :param config_file: path of the YAML file to read.
    :return: dict of settings.  ``updated`` defaults to the earliest
        representable UTC datetime; a string ``updated`` value from the file
        is parsed into a datetime.
    """
    # Make sure the submodule is loaded: a bare "import dateutil" does not
    # guarantee that dateutil.parser is available on every release.
    import dateutil.parser

    config = {
        'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc)
    }
    with open(config_file) as fh:
        # An empty file loads as None; treat that as "no settings" instead
        # of failing on the membership test below.
        cfg = yaml.load(fh, yaml.SafeLoader) or {}
    if 'updated' in cfg:
        updated = cfg['updated']
        # YAML may already have turned an unquoted timestamp into a datetime;
        # only strings need parsing.
        if isinstance(updated, str):
            cfg['updated'] = dateutil.parser.parse(updated)
    config.update(cfg)
    return config
 def setup(config_file):
    url = input('What is your Mastodon Instance URL? ')
    have_app = yes_no('Do you have your app credentials already?')
    if have_app:
@ -254,14 +158,12 @@ def setup(config_file):
    feed_url = input('RSS/Atom feed URL to watch: ')
    old_posts = yes_no('Shall already existing entries be tooted, too?')
    include_images = yes_no('Shall images be included in the toot?')
    config = {
        'name': name,
        'url': url,
        'client_id': client_id,
        'client_secret': client_secret,
        'access_token': access_token,
        'include_images': include_images,
        'feeds': [
            {'url': feed_url, 'template': '{title} {url}'}
        ]
--- a/setup.py
+++ b/setup.py
@ -5,7 +5,7 @@ with open("README.md") as f:
 setup(
    name='feediverse',
-    version='0.2.2',
+    version='0.3.0',
    python_requires='>=3.3',
    url='https://github.com/edsu/feediverse',
    author='Ed Summers',
@ -18,7 +18,6 @@ setup(
                      'feedparser',
                      'mastodon.py',
                      'python-dateutil',
-                      'pyyaml',
+                      'pyyaml'],
                      'urllib3[secure]'],
    entry_points={'console_scripts': ['feediverse = feediverse:main']}
 )