Re: My AI experiment on the forum
But the problem is that Patrol only shows new topics; if we want to monitor the full forum we also need to catch the replies inside threads. That is harder because RSS is blocked, and if we hammer the forum with requests to check every post, the server will block us. So if someone has a way to get all the new posts across the whole forum, that would help to build the right patrol tool.
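One way to keep the server happy while checking threads is to pace our own requests. Here is a minimal sketch of that idea; the polite_get helper and the 5-second spacing are my own assumptions, not a documented forum limit:

Code:
import time
import requests

MIN_DELAY = 5.0   # assumed safe spacing between requests; tune as needed
_last_call = 0.0

def polite_get(url):
    # Sleep just long enough to keep MIN_DELAY seconds between calls,
    # then fetch with the same User-Agent the parser below uses.
    global _last_call
    wait = MIN_DELAY - (time.time() - _last_call)
    if wait > 0:
        time.sleep(wait)
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
    _last_call = time.time()
    response.raise_for_status()
    return response.text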

As Linus Torvalds says... talk is cheap, show me the code.

Code:
from bs4 import BeautifulSoup
import json
import sys
import requests
import re

def extract_after(text, key):
    # Grab the first whitespace-separated token after a label such as
    # "Activity:" or "Merit:" in the poster sidebar text.
    try:
        return text.split(key)[1].split()[0]
    except IndexError:
        return None

def parse_quote_header(header_text):
    # Quote headers look like "Quote from: <user> on <date>".
    match = re.search(r"Quote from:\s*(.+?)\s+on\s+(.*)", header_text)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    return None, None

def extract_user_profiles(soup):
    # Map every poster name on the page to its profile URL, so quoted
    # authors can be resolved to a profile later.
    profiles = {}
    for td in soup.find_all("td", class_="poster_info"):
        a = td.find("a")
        if a:
            name = a.text.strip()
            href = a.get("href")
            profiles[name] = href
    return profiles

def extract_quotes_recursive(container, user_profiles):
    # Walk the direct-child quoteheader/quote pairs, recurse into nested
    # quotes, and decompose them so the remaining text is the body only.
    quotes = []
    headers = container.find_all("div", class_="quoteheader", recursive=False)

    for header in headers:
        quote = {}
        link_tag = header.find("a")
        quote["link"] = link_tag["href"] if link_tag else None
        user, date = parse_quote_header(header.get_text(strip=True))

        quote["author"] = user
        quote["profile_url"] = user_profiles.get(user, None)
        quote["date"] = date

        quote_block = header.find_next_sibling("div", class_="quote")
        if quote_block:
            quote["quotes"] = extract_quotes_recursive(quote_block, user_profiles)
            # Nested quote blocks were already captured by the recursion;
            # drop them so they don't leak into this quote's own text.
            for q in quote_block.find_all("div", class_="quote", recursive=False):
                q.decompose()
            quote["content"] = quote_block.get_text(strip=True)
            quote_block.decompose()
        else:
            quote["quotes"] = []
            quote["content"] = ""

        header.decompose()
        quotes.append(quote)

    return quotes

def parse_html_posts(html_content):
    # Each post on a bitcointalk thread page sits in a td.msgcl1 cell.
    soup = BeautifulSoup(html_content, "html.parser")
    post_blocks = soup.find_all("td", class_="msgcl1")
    user_profiles = extract_user_profiles(soup)
    posts_data = []

    for block in post_blocks:
        post = {}
        anchor = block.find("a")
        post["message_id"] = anchor.get("name") if anchor else None

        poster_td = block.find("td", class_="poster_info")
        if poster_td:
            user_link = poster_td.find("a")
            post["author"] = user_link.text.strip() if user_link else None
            post["profile_url"] = user_link["href"] if user_link else None

            activity_text = poster_td.get_text()
            post["activity"] = extract_after(activity_text, "Activity:")
            post["merit"] = extract_after(activity_text, "Merit:")

        subject_div = block.find("div", class_="subject")
        post["title"] = subject_div.get_text(strip=True) if subject_div else None

        date_div = subject_div.find_next_sibling("div") if subject_div else None
        post["date"] = date_div.get_text(strip=True) if date_div else None

        post_div = block.find("div", class_="post")
        if post_div:
            post["quotes"] = extract_quotes_recursive(post_div, user_profiles)
            post["content"] = post_div.get_text(strip=True)

        posts_data.append(post)

    return posts_data

def main():
    if len(sys.argv) < 2:
        print("Usage: python3 post_last.py <URL> [output.json]")
        sys.exit(1)

    url = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else "bitcointalk_parsed.json"

    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        posts_json = parse_html_posts(response.text)
        with open(output_path, "w", encoding="utf-8") as outfile:
            json.dump(posts_json, outfile, indent=2, ensure_ascii=False)

        print(f"Success! Saved to {output_path}")

    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()

Output:

https://privatebin.net/?b23d0b444e13d295#5xcuFNDVwzcPdZBjaiJtZoqaijrxUsstVM1G98WycE8z

Run it:

Code:
python3 post_last.py https://bitcointalk.org/index.php?topic=5546497.0 out.json
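Once out.json exists, feeding it to an automated filter is straightforward. A minimal sketch follows; flag_post is a hypothetical stand-in for whatever check or AI call we end up using, and the "giveaway" keyword is just a toy example:

Code:
import json

def flag_post(post):
    # Hypothetical placeholder: the real version would call the AI agent.
    return "giveaway" in (post.get("content") or "").lower()

with open("out.json", encoding="utf-8") as f:
    posts = json.load(f)

for post in posts:
    if flag_post(post):
        print(f'{post["author"]}: {post["title"]}')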

With this code we can directly parse any thread link from the forum and get a JSON file of its posts and quotes, which makes a nice input for an automated process that filters the content. A tool like this could be a good base for patrolling the forum: with another script we could fetch the updated threads from the boards of our interest, generate all the JSON files, and feed them to the AI agent.
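As a rough sketch of that second script, the idea below keys each board row on its topic id and stores the raw row text between runs, so any change in the row (a new reply count, a new last-post date) flags the thread as updated. The markup assumptions in fetch_board_rows are mine and may need adjusting:

Code:
import json
import os
import re
import time
import requests
from bs4 import BeautifulSoup

BOARD_URL = "https://bitcointalk.org/index.php?board=1.0"  # first page of the board we watch
STATE_FILE = "board_state.json"

def fetch_board_rows():
    # One entry per topic on the board's first page, keyed by topic id.
    html = requests.get(BOARD_URL, headers={"User-Agent": "Mozilla/5.0"}, timeout=30).text
    soup = BeautifulSoup(html, "html.parser")
    rows = {}
    for a in soup.find_all("a", href=re.compile(r"topic=\d+\.0$")):
        topic_id = re.search(r"topic=(\d+)", a["href"]).group(1)
        tr = a.find_parent("tr")
        if tr:
            rows[topic_id] = tr.get_text(" ", strip=True)
    return rows

def changed_topics():
    # Diff the current board rows against the ones saved on the last run.
    old = {}
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE, encoding="utf-8") as f:
            old = json.load(f)
    new = fetch_board_rows()
    with open(STATE_FILE, "w", encoding="utf-8") as f:
        json.dump(new, f)
    return [t for t, row in new.items() if old.get(t) != row]

for topic_id in changed_topics():
    url = f"https://bitcointalk.org/index.php?topic={topic_id}.0"
    print("updated:", url)  # here we would run the parser above on each URL
    time.sleep(5)  # stay polite; same idea as the rate-limit sketch earlier

Storing the raw row text rather than a hash keeps the state file easy to inspect when debugging why a thread was (or was not) flagged.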