Add cleanup of white-space in fetched texts.
@@ -12,6 +12,7 @@ from bs4 import BeautifulSoup
 from mastodon import Mastodon
 from datetime import datetime, timezone, MINYEAR
 import urllib3
+import re


 DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
@@ -164,6 +165,15 @@ def collect_images(entry, generator=None):


 def get_entry(entry, include_images, generator=None):
+
+    def cleanup(text):
+        text = BeautifulSoup(text, 'html.parser').get_text()
+        text = re.sub('\xa0+', ' ', text)
+        text = re.sub(' +', ' ', text)
+        text = re.sub(' +\n', '\n', text)
+        text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
+        return text.strip()
+
     hashtags = []
     for tag in entry.get('tags', []):
         for t in tag['term'].split():
@@ -171,7 +181,7 @@ def get_entry(entry, include_images, generator=None):
     summary = entry.get('summary', '')
     content = entry.get('content', '') or ''
     if content:
-        content = content[0].get('value', '')
+        content = cleanup(content[0].get('value', ''))
     url = entry.id
     if generator == "wordpress":
         links = [l for l in entry.links if l.get("rel") == "alternate"]
@@ -181,9 +191,9 @@ def get_entry(entry, include_images, generator=None):
             url = links[0]["href"]
     return {
         'url': url,
-        'title': BeautifulSoup(entry.title, 'html.parser').get_text(),
-        'summary': BeautifulSoup(summary, 'html.parser').get_text(),
-        'content': BeautifulSoup(content, 'html.parser').get_text(),
+        'title': cleanup(entry.title),
+        'summary': cleanup(summary),
+        'content': content,
         'hashtags': ' '.join(hashtags),
         'updated': dateutil.parser.parse(entry['updated']),
         'images': collect_images(entry, generator) if include_images else [],
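
For context, here is a minimal, self-contained sketch of the white-space normalization introduced above, with a small usage example. The body of `cleanup()` mirrors the lines added in this commit; the sample HTML string and the `__main__` harness are illustrative only and not part of the change.

```python
import re

from bs4 import BeautifulSoup


def cleanup(text):
    # Strip markup first, then normalize the remaining whitespace
    # (same steps as the helper added in this commit).
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub('\xa0+', ' ', text)                    # non-breaking spaces -> space
    text = re.sub(' +', ' ', text)                       # collapse runs of spaces
    text = re.sub(' +\n', '\n', text)                    # drop trailing spaces on lines
    text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)   # 3+ newlines -> one blank line
    return text.strip()


if __name__ == '__main__':
    # Illustrative input only; feediverse would pass entry.title, the summary,
    # or the entry content value here.
    sample = '<p>Hello&nbsp;&nbsp;world   </p>\n\n\n\n<p>Second  paragraph</p>'
    print(repr(cleanup(sample)))
    # -> 'Hello world\n\nSecond paragraph'
```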