# Article scraper: extract title, body text, and image URLs from article HTML.
from urllib.parse import urlparse

from bs4 import BeautifulSoup
def isValidURL(url):
    """Return True if *url* parses with both a scheme and a network location.

    A bare path, relative URL, or empty string yields False. Parsing
    errors are treated as "invalid" rather than propagated.
    """
    try:
        result = urlparse(url)
        # Both pieces must be non-empty for a usable absolute URL.
        return all([result.scheme, result.netloc])
    except ValueError:  # narrowed from a bare except: urlparse raises ValueError on malformed input
        return False
def isর্ণImageURL(url):
    """Return True if *url* ends with a recognized image file extension.

    The comparison is case-insensitive, so ".JPG" matches ".jpg".

    NOTE(review): the function name appears mojibake-corrupted (presumably
    meant to be ``isImageURL``); it is kept unchanged so callers resolve.
    """
    recognized = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp')
    # str.endswith accepts a tuple, replacing the original any() scan.
    return url.lower().endswith(recognized)
def extractimageurls(htmlcontent):
    """Collect valid image URLs from every <img src=...> tag in *htmlcontent*.

    Parameters
    ----------
    htmlcontent : str
        Raw HTML to scan.

    Returns
    -------
    list of str
        ``src`` values that are both valid absolute URLs and image URLs,
        in document order.
    """
    soup = BeautifulSoup(htmlcontent, 'html.parser')
    imageurls = []
    # find_all (not `findall`) is the BeautifulSoup API for tag searches.
    for img in soup.find_all('img'):
        url = img.get('src')
        # Fixed `adn` typo; also skip <img> tags with no src attribute.
        if url and isValidURL(url) and isর্ণImageURL(url):
            imageurls.append(url)
    return imageurls
def generatestructuredcontent(title, imageurl, content):
    """Bundle the extracted article fields into a plain dict.

    Keys are "title", "imageurl", and "content", mapped directly from the
    corresponding arguments.
    """
    structured = dict(title=title, imageurl=imageurl, content=content)
    return structured
def processarticle(htmlcontent):
    """Parse one article page into a structured dict, or None on failure.

    Extracts the <h1 class="entry-title"> heading, the text of every <p>
    inside <div class="td-post-content">, and the first valid image URL
    found within that div.

    Parameters
    ----------
    htmlcontent : str
        Raw HTML of the article page.

    Returns
    -------
    dict or None
        ``{"title", "imageurl", "content"}`` on success; None when the
        main content div is missing.
    """
    soup = BeautifulSoup(htmlcontent, 'html.parser')

    # `class` is a Python keyword, so BeautifulSoup's filter is `class_`.
    titleelement = soup.find('h1', class_='entry-title')
    title = titleelement.text.strip() if titleelement else "Untitled"

    contentdiv = soup.find('div', class_='td-post-content')
    if not contentdiv:
        return None  # Unable to extract content

    paragraphs = contentdiv.find_all('p')  # find_all, not `findall`
    # Join paragraphs with real newlines (original joined on literal 'n').
    contenttext = '\n'.join(p.text for p in paragraphs)

    imageurls = extractimageurls(str(contentdiv))
    mainimageurl = imageurls[0] if imageurls else None

    return generatestructuredcontent(title, mainimageurl, contenttext)
# Example usage: run this module directly against a small sample page.
# (The original tail referenced an undefined `htmlcontent` at module level
# and contained bare prose lines; guarded and made self-contained here.)
if __name__ == "__main__":
    htmlcontent = (
        '<h1 class="entry-title">Sample</h1>'
        '<div class="td-post-content"><p>Hello.</p>'
        '<img src="https://example.com/pic.jpg"/></div>'
    )
    structureddata = processarticle(htmlcontent)
    if structureddata:
        print(structureddata)
    else:
        print("Failed to extract data from the article.")
