Add a workaround for a WordPress bug (URLs containing non-ASCII characters are not percent-encoded).

2019-03-30 23:34:55 +01:00
parent ae78c8c16f
commit d2e57bbc27
1 changed files with 17 additions and 2 deletions
--- a/feediverse.py
+++ b/feediverse.py
@ -2,6 +2,7 @@
 import os
 import sys
 import codecs
 import argparse
 import yaml
 import dateutil
@ -17,6 +18,14 @@ DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
 http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',)
 # Encoding error handler for buggy WordPress URLs: instead of failing on a
 # character the target codec cannot represent, emit the %-escaped UTF-8
 # bytes of that character.
 def __urlencodereplace_errors(exc):
    """Replace the unencodable span with its percent-escaped UTF-8 bytes.

    Intended for use as ``str.encode(..., errors="urlencodereplace")``.

    Args:
        exc: The ``UnicodeEncodeError`` raised by the codec; ``exc.object``
            is the string being encoded and ``exc.start``/``exc.end``
            delimit the offending characters.

    Returns:
        A ``(replacement, resume_position)`` tuple, where ``replacement``
        is a ``bytes`` object such as ``b"%C3%BC"`` and
        ``resume_position`` is ``exc.end``.

    Raises:
        TypeError: If ``exc`` is not a ``UnicodeEncodeError`` (for example
            when the handler name is passed to ``bytes.decode``).
    """
    if not isinstance(exc, UnicodeEncodeError):
        # Only encoding errors carry a str in exc.object; fail with a clear
        # message instead of an unrelated AttributeError further down.
        raise TypeError("don't know how to handle %s in error callback"
                        % type(exc).__name__)
    bs = exc.object[exc.start:exc.end].encode("utf-8")
    # %02X keeps every escape exactly two hex digits wide, which is what
    # percent-encoding requires even for byte values below 0x10.
    bs = b"".join(b"%%%02X" % b for b in bs)
    return (bs, exc.end)
 codecs.register_error("urlencodereplace", __urlencodereplace_errors)
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE",
@ -90,7 +99,7 @@ def get_feed(feed_url, last_update):
        yield get_entry(entry, generator)
    return new_entries
-def collect_images(entry):
+def collect_images(entry, generator=None):
    def find_urls(part):
        if not part:
@ -113,6 +122,12 @@ def collect_images(entry):
        if (e["type"].startswith(("image/", "video/")) and
            e["href"] not in urls):
            urls.append(e["href"])
    if generator == "wordpress":
        # Work around a wordpress bug: If the filename contains an
        # umlaut, this will not be encoded using %-escape, as the
        # standard demands. This will break encoding in http.request()
        urls = (u.encode("ascii", "urlencodereplace").decode()
                for u in urls)
    images = []
    for url in urls:
        resp = http.request('GET', url, preload_content=False)
@ -145,7 +160,7 @@ def get_entry(entry, generator=None):
        'content': BeautifulSoup(summary, 'html.parser').get_text(),
        'hashtags': ' '.join(hashtags),
        'updated': dateutil.parser.parse(entry['updated']),
-        'images': collect_images(entry),
+        'images': collect_images(entry, generator),
    }
 def setup(config_file):