diff --git a/sitegen/site.py b/sitegen/site.py index b7fbf6b..4aaddd3 100644 --- a/sitegen/site.py +++ b/sitegen/site.py @@ -58,13 +58,25 @@ def cut_text(filename, count): soup = BeautifulSoup(html, features="lxml") for script in soup(["script", "style"]): script.extract() - k = [] - for i in soup.findAll("p")[1]: - k.append(i) - b = "".join(str(e) for e in k) - text = html2text(b.replace("\n", " ")) - textreduced = (text[:count] + " [...]") if len(text) > count else (text) - return textreduced + paragraphs = soup.find_all("p") + + # No <p> tags at all → return empty string + if not paragraphs: + return "" + + # If only one <p>, use that one; otherwise use the second + target = paragraphs[1] if len(paragraphs) > 1 else paragraphs[0] + + # Convert contents of the <p> to HTML string + b = "".join(str(e) for e in target.contents) + + # Convert to text + text = html2text(b.replace("\n", " ")).strip() + + # Truncate + if len(text) > count: + return text[:count] + " [...]" + return text def extract_body(text, content_id="newspost-content"):