Enhance cleanup of fetched texts.
Remove all HTML elements whose class is "read-more" or matches "read-more-*" before extracting the text. This keeps the "Read More" markers out of the cleaned-up text.
This commit is contained in:
@ -175,7 +175,11 @@ def collect_images(entry, generator=None):
|
|||||||
def get_entry(entry, include_images, generator=None):
|
def get_entry(entry, include_images, generator=None):
|
||||||
|
|
||||||
def cleanup(text):
|
def cleanup(text):
|
||||||
text = BeautifulSoup(text, 'html.parser').get_text()
|
html = BeautifulSoup(text, 'html.parser')
|
||||||
|
# Remove all elements of class read-more or read-more-*
|
||||||
|
for more in html.find_all(None, re.compile("^read-more($|-.*)")):
|
||||||
|
more.extract()
|
||||||
|
text = html.get_text()
|
||||||
text = re.sub('\xa0+', ' ', text)
|
text = re.sub('\xa0+', ' ', text)
|
||||||
text = re.sub(' +', ' ', text)
|
text = re.sub(' +', ' ', text)
|
||||||
text = re.sub(' +\n', '\n', text)
|
text = re.sub(' +\n', '\n', text)
|
||||||
|
|||||||
Reference in New Issue
Block a user