Add white-space cleanup for fetched texts.

This commit is contained in:
Hartmut Goebel
2019-04-23 22:37:57 +02:00
parent 09a3588f71
commit 9e1a94d4ca

View File

@ -12,6 +12,7 @@ from bs4 import BeautifulSoup
from mastodon import Mastodon from mastodon import Mastodon
from datetime import datetime, timezone, MINYEAR from datetime import datetime, timezone, MINYEAR
import urllib3 import urllib3
import re
DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse") DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
@ -164,6 +165,15 @@ def collect_images(entry, generator=None):
def get_entry(entry, include_images, generator=None): def get_entry(entry, include_images, generator=None):
def cleanup(text):
    """Return *text* as plain text with its white-space tidied up.

    The input is parsed with BeautifulSoup's ``html.parser`` and reduced
    to its text content.  The substitutions below are then applied in
    order, and finally leading/trailing white-space is stripped.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()

    # (pattern, replacement, flags) -- applied strictly in this order,
    # because each step relies on the output of the previous one:
    #   1. runs of non-breaking spaces become a single ordinary space
    #   2. runs of ordinary spaces collapse to one space
    #   3. spaces directly in front of a newline are dropped
    #   4. three or more consecutive newlines shrink to exactly two
    # re.M is kept on the last step as in the original code; the pattern
    # contains no ^/$ anchors, so the flag does not alter what matches.
    substitutions = (
        ('\xa0+', ' ', 0),
        (' +', ' ', 0),
        (' +\n', '\n', 0),
        ('\n\n\n+', '\n\n', re.M),
    )
    for pattern, replacement, flags in substitutions:
        plain = re.sub(pattern, replacement, plain, flags=flags)

    return plain.strip()
hashtags = [] hashtags = []
for tag in entry.get('tags', []): for tag in entry.get('tags', []):
for t in tag['term'].split(): for t in tag['term'].split():
@ -171,7 +181,7 @@ def get_entry(entry, include_images, generator=None):
summary = entry.get('summary', '') summary = entry.get('summary', '')
content = entry.get('content', '') or '' content = entry.get('content', '') or ''
if content: if content:
content = content[0].get('value', '') content = cleanup(content[0].get('value', ''))
url = entry.id url = entry.id
if generator == "wordpress": if generator == "wordpress":
links = [l for l in entry.links if l.get("rel") == "alternate"] links = [l for l in entry.links if l.get("rel") == "alternate"]
@ -181,9 +191,9 @@ def get_entry(entry, include_images, generator=None):
url = links[0]["href"] url = links[0]["href"]
return { return {
'url': url, 'url': url,
'title': BeautifulSoup(entry.title, 'html.parser').get_text(), 'title': cleanup(entry.title),
'summary': BeautifulSoup(summary, 'html.parser').get_text(), 'summary': cleanup(summary),
'content': BeautifulSoup(content, 'html.parser').get_text(), 'content': content,
'hashtags': ' '.join(hashtags), 'hashtags': ' '.join(hashtags),
'updated': dateutil.parser.parse(entry['updated']), 'updated': dateutil.parser.parse(entry['updated']),
'images': collect_images(entry, generator) if include_images else [], 'images': collect_images(entry, generator) if include_images else [],