Jaén Hour: Motor & Sport Event Guide

toolcode
from urllib.parse import urlparse

def isValidURL(url):
    """Return True when *url* parses with both a scheme and a network location.

    Args:
        url: Candidate URL to check.

    Returns:
        ``True`` if ``urlparse`` yields a non-empty scheme and netloc,
        ``False`` otherwise, including when the value cannot be parsed.
    """
    try:
        result = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError on malformed input (e.g. an unbalanced
        # IPv6 bracket) and TypeError/AttributeError on non-string values.
        # Catch only those so KeyboardInterrupt/SystemExit still propagate.
        return False
    return bool(result.scheme and result.netloc)

def isর্ণImageURL(url):
    """Tell whether *url* ends with a known image file extension.

    The comparison is case-insensitive and looks only at the very end of
    the string, so anything after the extension (such as a query string)
    makes the check fail.
    """
    suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp')
    return url.lower().endswith(suffixes)

def extractimageurls(htmlcontent):
    """Collect the ``src`` of every ``<img>`` tag that is a valid image URL.

    Args:
        htmlcontent: HTML markup to scan.

    Returns:
        A list of ``src`` values, in document order, that pass both
        ``isValidURL`` and ``isর্ণImageURL``. Empty when none qualify.

    NOTE(review): ``BeautifulSoup`` is not imported in the visible part of
    this file -- presumably ``from bs4 import BeautifulSoup``; confirm.
    """
    soup = BeautifulSoup(htmlcontent, 'html.parser')
    imageurls = []
    for img in soup.find_all('img'):
        url = img.get('src')
        # ``src`` may be missing or empty; skip those before validating.
        if url and isValidURL(url) and isর্ণImageURL(url):
            imageurls.append(url)
    return imageurls

def generatestructuredcontent(title, imageurl, content):
    """Bundle the extracted article fields into a plain dictionary.

    Returns:
        A dict with the keys ``title``, ``imageurl`` and ``content`` mapped
        to the corresponding arguments, unchanged.
    """
    structured = dict(title=title, imageurl=imageurl, content=content)
    return structured

def processarticle(htmlcontent):
    """Extract the title, main image URL and body text from an article page.

    Args:
        htmlcontent: HTML markup of the article page.

    Returns:
        The dict built by ``generatestructuredcontent`` (keys ``title``,
        ``imageurl``, ``content``), or ``None`` when the page has no
        ``<div class="td-post-content">`` element.

    NOTE(review): ``BeautifulSoup`` is not imported in the visible part of
    this file -- presumably ``from bs4 import BeautifulSoup``; confirm.
    """
    soup = BeautifulSoup(htmlcontent, 'html.parser')

    # Extract title. ``class`` is a reserved word in Python, so the
    # CSS-class filter has to be passed as ``class_``.
    titleelement = soup.find('h1', class_='entry-title')
    title = titleelement.text.strip() if titleelement else "Untitled"

    # Extract main content
    contentdiv = soup.find('div', class_='td-post-content')
    if not contentdiv:
        return None  # Unable to extract content

    # One paragraph per line.
    paragraphs = contentdiv.find_all('p')
    contenttext = '\n'.join(p.text for p in paragraphs)

    # Extract image URLs; the first qualifying image is the main one.
    imageurls = extractimageurls(str(contentdiv))
    mainimageurl = imageurls[0] if imageurls else None

    return generatestructuredcontent(title, mainimageurl, contenttext)

Example usage (assuming you have the HTML content in a variable called htmlcontent):

Table of Contents

# Example usage: ``htmlcontent`` must already hold the article's HTML.
structureddata = processarticle(htmlcontent)

if structureddata:
    print(structureddata)
else:
    # processarticle returns None when the content container is missing.
    print("Failed to extract data from the article.")

Related Posts

Leave a Comment