| import urllib.parse |
| from bs4 import BeautifulSoup |
| from crawl4ai import AsyncWebCrawler |
| import re |
| import asyncio |
|
|
def process_url(url, sub_url):
    """Resolve *sub_url* against the base *url* and return an absolute URL.

    Thin wrapper over :func:`urllib.parse.urljoin`; relative references are
    resolved against the base, absolute references are returned as-is.
    """
    absolute = urllib.parse.urljoin(url, sub_url)
    return absolute
|
|
|
|
def clean_markdown(res):
    """Strip markdown links, bare URLs and excess blank lines from *res*.

    Removes ``[text](target)`` inline links entirely (link text included),
    deletes bare ``http(s)://`` URLs, drops empty bullet markers (``"* \\n"``)
    and collapses runs of consecutive newlines into a single newline.

    Args:
        res: Markdown text to clean (as produced by the crawler).

    Returns:
        The cleaned string, or *res* unchanged if cleaning fails — this is
        deliberate best-effort cleanup; a failure here should not abort the
        surrounding crawl.
    """
    # Markdown inline links: [text](target) — remove the whole construct.
    link_pattern = r'\[.*?\]\(.*?\)'
    # Bare http/https URLs. NOTE(review): [$-_@.&+] is a character *range*
    # ($ through _), which also covers digits and much punctuation — kept
    # as-is to preserve the existing matching behaviour; confirm intent.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    try:
        result = re.sub(link_pattern, '', res)
        result = re.sub(url_pattern, '', result)
        result = result.replace("* \n", "")
        # Collapse two or more consecutive newlines into one.
        result = re.sub(r"\n\n+", "\n", result)
        return result
    except Exception:
        # Best-effort: on any failure return the input untouched.
        return res
|
|
async def get_info(url, screentshot=True):
    """Crawl *url* and return its HTML, cleaned markdown and (optionally) a screenshot.

    Args:
        url: Page to crawl.
        screentshot: When True (default), ask the crawler to capture a
            screenshot and include it in the result.  NOTE(review): the
            parameter name keeps the original "screentshot" spelling for
            backward compatibility with existing keyword callers.

    Returns:
        ``(html, cleaned_markdown, screenshot)`` when *screentshot* is true,
        otherwise ``(html, cleaned_markdown)``.  (The original ``-> str``
        annotation was incorrect and has been removed.)
    """
    async with AsyncWebCrawler() as crawler:
        # Single crawl call — the flag only controls screenshot capture and
        # the shape of the returned tuple (previously the identical call was
        # duplicated in both branches).
        result = await crawler.arun(url, screenshot=screentshot)
        if screentshot:
            return result.html, clean_markdown(result.markdown), result.screenshot
        return result.html, clean_markdown(result.markdown)
| |
if __name__ == "__main__":
    # Quick manual check: crawl the ACL 2024 conference site.
    target = "https://2024.aclweb.org/"
    asyncio.run(get_info(target))