Knack-Scraper/transform/url_node.py
2026-01-27 20:19:05 +01:00

160 lines
No EOL
5.1 KiB
Python

"""Nodes to extract URL in text using regex patterns."""
import sqlite3
import pandas as pd
import logging
import re
from urllib.parse import urlparse
from pipeline import TransformContext
from transform_node import TransformNode
logger = logging.getLogger("knack-transform")
class URLNode(TransformNode):
    """Node that looks for URLs in the text-column in posts.

    Stores data in a new table urls:
    - id, post_id, url_raw, tld, host
    """

    # Compiled once at class level instead of per post row (was rebuilt
    # inside the loop on every iteration).
    _URL_PATTERN = re.compile(
        r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}"
        r"\b[-a-zA-Z0-9@:%_\+.~#?&/=]*"
    )

    def __init__(self):
        super().__init__()
        logger.info("Init URL Node")

    def _create_tables(self, con: sqlite3.Connection):
        """Create urls table if they don't exist."""
        con.execute("""
            CREATE TABLE IF NOT EXISTS urls (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                url_raw TEXT,
                tld TEXT,
                host TEXT,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)
        con.commit()

    def _process_data(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Extract all URLs from each post's text.

        Args:
            input_df (pd.DataFrame): posts with at least 'id' and 'text'
                columns.

        Returns:
            pd.DataFrame with one row per URL found, columns:
            post_id, url_raw, host, tld. Empty if nothing matched.
        """
        logger.info(f"Processing {len(input_df)} rows")
        mappings = []
        for _, post_row in input_df.iterrows():
            post_id = post_row['id']
            post_text = post_row['text']
            # Posts with NULL text arrive as None/NaN from SQLite; the
            # original code would raise TypeError in re.findall here.
            if not isinstance(post_text, str):
                logger.debug(f"Post {post_id} has no text, skipping")
                continue
            urls = self._URL_PATTERN.findall(post_text)
            logger.debug(f"Post {post_id}, text preview: {post_text[:50]}, URLs found: {len(urls)}")
            for url in urls:
                try:
                    parsed = urlparse(url)
                    # .hostname (unlike .netloc) strips any port and
                    # userinfo and lowercases — netloc gave e.g.
                    # 'example.com:8080' and a TLD of 'com:8080'.
                    hostname = parsed.hostname or ""
                    # If the hostname starts with www. remove that part.
                    if hostname.startswith('www.'):
                        hostname = hostname[4:]
                    # TLD is the last dot-separated label of the host.
                    tld = hostname.rsplit('.', 1)[-1] if hostname else ""
                    mappings.append({
                        'post_id': post_id,
                        'url_raw': url,
                        'host': hostname,
                        'tld': tld
                    })
                    logger.debug(f"  URL: {url} -> Host: {hostname}, TLD: {tld}")
                except Exception as e:
                    logger.warning(f"Failed to parse URL {url}: {e}")
        result_df = pd.DataFrame(mappings)
        logger.info(f"Extracted {len(result_df)} URLs from {len(input_df)} posts")
        return result_df

    def _store_results(self, con: sqlite3.Connection, result_df: pd.DataFrame):
        """Append the extracted URLs to the urls table (no-op if empty)."""
        if result_df.empty:
            logger.info("No URLs to store")
            return
        result_df.to_sql('urls', con, if_exists='append', index=False)
        logger.info(f"Stored {len(result_df)} URLs to database")

    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Executes the URL Node.

        Writes to a new table urls and creates said table if it does not
        exist currently.

        Args:
            con (sqlite3.Connection): SQLite database connection
            context (TransformContext): Transformcontext,
                containing the input dataframe of all posts

        Returns:
            TransformContext with processed dataframe.
        """
        logger.info("Starting URLNode transformation")
        input_df = context.get_dataframe()
        if input_df.empty:
            logger.warning("Empty dataframe. Skipping URLNode")
            return context
        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)
        logger.info("Node transformation complete")
        # Pass the original posts dataframe downstream unchanged.
        return TransformContext(input_df)
def main():
    """Run URLNode standalone against the local knack SQLite database.

    Sets up stdout logging, loads all posts, runs the URL extraction
    node, and closes the connection even on failure.
    """
    import sys
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    # Connect to database
    db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
    con = sqlite3.connect(db_path)
    try:
        # Read posts from database
        df = pd.read_sql('SELECT * FROM posts;', con)
        logger.info(f"Loaded {len(df)} posts from database")
        # Create context
        context = TransformContext(df)
        # Run URLNode (messages previously said "NerAuthorNode" —
        # copy-paste leftover from another node's main()).
        logger.info("Running URLNode...")
        node = URLNode()
        context = node.run(con, context)
        logger.info("URLNode complete")
        logger.info("URL node completed successfully!")
    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        con.close()


if __name__ == '__main__':
    main()