But the problem is that patrol only shows new posts; if we want to monitor the full forum we would also have to filter the replies inside the threads, which is complex because the RSS feed is blocked, and if we abuse the forum with requests to check the individual posts we will get blocked by the server. So, if someone has a way to get all the new posts across the whole forum, that would help to build the right patrol tool.
As Linus Torvalds says: "Talk is cheap. Show me the code."
import json
import re
import sys

import requests
from bs4 import BeautifulSoup


def extract_after(text, key):
    # Return the first whitespace-delimited token that follows `key` in `text`,
    # e.g. extract_after("Activity: 42", "Activity:") -> "42".
    try:
        return text.split(key)[1].split()[0]
    except (IndexError, AttributeError):
        return None


def parse_quote_header(header_text):
    # Quote headers read "Quote from: <user> on <date>".
    match = re.search(r"Quote from:\s*(.+?)\s+on\s+(.*)", header_text)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    return None, None


def extract_user_profiles(soup):
    # Map each poster's display name to their profile URL.
    profiles = {}
    for td in soup.find_all("td", class_="poster_info"):
        a = td.find("a")
        if a:
            name = a.text.strip()
            href = a.get("href")
            profiles[name] = href
    return profiles


def extract_quotes_recursive(container, user_profiles):
    # Collect the quote blocks sitting directly under `container`, recursing
    # into each one, then decompose them so only the post's own text remains.
    quotes = []
    headers = container.find_all("div", class_="quoteheader", recursive=False)
    for header in headers:
        quote = {}
        link_tag = header.find("a")
        quote["link"] = link_tag["href"] if link_tag else None
        user, date = parse_quote_header(header.get_text(strip=True))
        quote["author"] = user
        quote["profile_url"] = user_profiles.get(user)
        quote["date"] = date
        quote_block = header.find_next_sibling("div", class_="quote")
        if quote_block:
            quote["quotes"] = extract_quotes_recursive(quote_block, user_profiles)
            for q in quote_block.find_all("div", class_="quote", recursive=False):
                q.decompose()
            quote["content"] = quote_block.get_text(strip=True)
            quote_block.decompose()
        else:
            quote["quotes"] = []
            quote["content"] = ""
        header.decompose()
        quotes.append(quote)
    return quotes


def parse_html_posts(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    post_blocks = soup.find_all("td", class_="msgcl1")
    user_profiles = extract_user_profiles(soup)
    posts_data = []
    for block in post_blocks:
        post = {}
        anchor = block.find("a")
        post["message_id"] = anchor.get("name") if anchor else None
        poster_td = block.find("td", class_="poster_info")
        if poster_td:
            user_link = poster_td.find("a")
            post["author"] = user_link.text.strip() if user_link else None
            post["profile_url"] = user_link["href"] if user_link else None
            activity_text = poster_td.get_text()
            post["activity"] = extract_after(activity_text, "Activity:")
            post["merit"] = extract_after(activity_text, "Merit:")
        subject_div = block.find("div", class_="subject")
        post["title"] = subject_div.get_text(strip=True) if subject_div else None
        date_div = subject_div.find_next_sibling("div") if subject_div else None
        post["date"] = date_div.get_text(strip=True) if date_div else None
        post_div = block.find("div", class_="post")
        if post_div:
            post["quotes"] = extract_quotes_recursive(post_div, user_profiles)
            post["content"] = post_div.get_text(strip=True)
        posts_data.append(post)
    return posts_data


def main():
    if len(sys.argv) < 2:
        print("Usage: python3 post_last.py <URL> [output.json]")
        sys.exit(1)
    url = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else "bitcointalk_parsed.json"
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        posts_json = parse_html_posts(response.text)
        with open(output_path, "w", encoding="utf-8") as outfile:
            json.dump(posts_json, outfile, indent=2, ensure_ascii=False)
        print(f"Success! Saved to {output_path}")
    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
Output: https://privatebin.net/?b23d0b444e13d295#5xcuFNDVwzcPdZBjaiJtZoqaijrxUsstVM1G98WycE8z

Run it:

python3 post_last.py "https://bitcointalk.org/index.php?topic=5546497.0" out.json
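For reference, each element in the output file follows the shape below. The field names come straight from the parser above; the values are placeholders, not real forum data:

[
  {
    "message_id": "...",
    "author": "...",
    "profile_url": "...",
    "activity": "...",
    "merit": "...",
    "title": "...",
    "date": "...",
    "quotes": [
      {
        "link": "...",
        "author": "...",
        "profile_url": "...",
        "date": "...",
        "quotes": [],
        "content": "..."
      }
    ],
    "content": "..."
  }
]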
With this code we can directly parse any thread link from the forum and get a JSON of its posts and quotes, which would be a nice input for an automated process to filter the content. A tool like this could be a nice base to patrol the forum: with another script we could get the updated threads directly from the boards of interest, generate all the JSONs, and feed the AI agent; a rough sketch of that second script is below.
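Here is a minimal sketch of that board-watching script, under some assumptions: the name board_watch.py is mine, and instead of relying on the exact HTML layout of a board page it just guesses that every thread link contains "topic=<id>", so a regex over the raw HTML is enough:

import re
import sys

import requests


def list_board_topics(board_url):
    # Fetch one board index page and pull out the distinct topic IDs.
    # Assumption: every thread link on the page contains "topic=<digits>".
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(board_url, headers=headers, timeout=30)
    response.raise_for_status()
    topic_ids = sorted(set(re.findall(r"topic=(\d+)", response.text)))
    return ["https://bitcointalk.org/index.php?topic=%s.0" % tid for tid in topic_ids]


def main():
    if len(sys.argv) < 2:
        print("Usage: python3 board_watch.py <BOARD_URL>")
        sys.exit(1)
    for topic_url in list_board_topics(sys.argv[1]):
        print(topic_url)


if __name__ == "__main__":
    main()

Each printed URL can then be fed to post_last.py, with a pause of a few seconds between fetches so we do not trip the server-side blocking mentioned above, and diffing each run's list against the previous run would give exactly the updated threads to re-parse.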