"""Node that extracts URLs from post text using a regex pattern."""
import logging
import re
import sqlite3
from urllib.parse import urlparse

import pandas as pd

from pipeline import TransformContext
from transform_node import TransformNode

logger = logging.getLogger("knack-transform")

# Default database location used by main() when no path is passed in.
DEFAULT_DB_PATH = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"

# Compiled once at import time instead of once per processed post.
_URL_PATTERN = re.compile(
    r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9@:%_\+.~#?&/=]*"
)

# Columns written to the urls table (the id column is auto-generated).
_URL_COLUMNS = ['post_id', 'url_raw', 'host', 'tld']


class URLNode(TransformNode):
    """Node that looks for URLs in the text column of posts.

    Stores data in a new table ``urls``:
        - id, post_id, url_raw, tld, host
    """

    def __init__(self):
        super().__init__()
        logger.info("Init URL Node")

    def _create_tables(self, con: sqlite3.Connection):
        """Create the urls table if it does not exist yet.

        Args:
            con (sqlite3.Connection): SQLite database connection
        """
        con.execute("""
            CREATE TABLE IF NOT EXISTS urls (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                url_raw TEXT,
                tld TEXT,
                host TEXT,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)
        con.commit()

    def _process_data(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Extract every URL found in the ``text`` column of the posts.

        Args:
            input_df (pd.DataFrame): Posts; the ``id`` and ``text`` columns
                are read.

        Returns:
            DataFrame with the columns post_id, url_raw, host and tld, one
            row per URL found. The columns are present even when no URL was
            found.
        """
        logger.info("Processing %d rows", len(input_df))

        mappings = []
        for post_id, post_text in zip(input_df['id'], input_df['text']):
            # NULL / NaN text cannot be searched; skip instead of crashing.
            if not isinstance(post_text, str):
                logger.debug("Post %s has no text, skipping", post_id)
                continue

            urls = _URL_PATTERN.findall(post_text)
            logger.debug(
                "Post %s, text preview: %s, URLs found: %d",
                post_id, post_text[:50], len(urls),
            )

            for url in urls:
                try:
                    # .hostname (unlike .netloc) excludes userinfo and port,
                    # so "example.com:8080" yields host "example.com" and
                    # TLD "com". It is also lowercased by urllib.
                    hostname = urlparse(url).hostname or ""
                except ValueError as e:
                    logger.warning("Failed to parse URL %s: %s", url, e)
                    continue

                # If the hostname starts with www. remove that part.
                if hostname.startswith('www.'):
                    hostname = hostname[4:]

                # TLD is the part after the last dot (empty for empty host).
                tld = hostname.rpartition('.')[2]

                mappings.append({
                    'post_id': post_id,
                    'url_raw': url,
                    'host': hostname,
                    'tld': tld,
                })
                logger.debug(" URL: %s -> Host: %s, TLD: %s", url, hostname, tld)

        result_df = pd.DataFrame(mappings, columns=_URL_COLUMNS)
        logger.info("Extracted %d URLs from %d posts", len(result_df), len(input_df))
        return result_df

    def _store_results(self, con: sqlite3.Connection, result_df: pd.DataFrame):
        """Append the extracted URLs to the urls table.

        Args:
            con (sqlite3.Connection): SQLite database connection
            result_df (pd.DataFrame): Rows to append; nothing is written when
                it is empty.
        """
        if result_df.empty:
            logger.info("No URLs to store")
            return

        result_df.to_sql('urls', con, if_exists='append', index=False)
        logger.info("Stored %d URLs to database", len(result_df))

    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Executes the URL Node.

        Writes to a new table urls and creates said table if it does not
        exist currently.

        Args:
            con (sqlite3.Connection): SQLite database connection
            context (TransformContext): Transformcontext, containing the
                input dataframe of all posts

        Returns:
            The given context unchanged when its dataframe is empty,
            otherwise a new TransformContext wrapping the input dataframe.
        """
        logger.info("Starting URLNode transformation")

        input_df = context.get_dataframe()
        if input_df.empty:
            logger.warning("Empty dataframe. Skipping URLNode")
            return context

        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)

        logger.info("Node transformation complete")
        return TransformContext(input_df)


def main(db_path: str = DEFAULT_DB_PATH):
    """Run the URLNode once against all posts in a SQLite database.

    Args:
        db_path (str): Path of the SQLite database file; defaults to
            DEFAULT_DB_PATH.

    Raises:
        Exception: Any error raised while loading or transforming the posts
            is logged and re-raised; the connection is closed either way.
    """
    import sys

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )

    # Connect to database
    con = sqlite3.connect(db_path)

    try:
        # Read posts from database
        df = pd.read_sql('SELECT * FROM posts;', con)
        logger.info("Loaded %d posts from database", len(df))

        # Create context
        context = TransformContext(df)

        # Run URLNode
        logger.info("Running URLNode...")
        node = URLNode()
        context = node.run(con, context)
        logger.info("URLNode complete")

        logger.info("URL transformation completed successfully!")

    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        con.close()


if __name__ == '__main__':
    main()