Knack-Scraper/transform/url_node.py
2026-01-27 20:19:05 +01:00

160 lines
No EOL
5.1 KiB
Python

"""Nodes to extract URL in text using regex patterns."""
import sqlite3
import pandas as pd
import logging
import re
from urllib.parse import urlparse
from pipeline import TransformContext
from transform_node import TransformNode
logger = logging.getLogger("knack-transform")
class URLNode(TransformNode):
    """Node that looks for URLs in the text-column in posts.

    Stores data in a new table urls:
    - id, post_id, url_raw, tld, host
    """

    # Compiled once at class level instead of per post row (was rebuilt
    # inside the loop on every iteration).
    _URL_PATTERN = re.compile(
        r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}"
        r"\b[-a-zA-Z0-9@:%_\+.~#?&/=]*"
    )

    def __init__(self):
        super().__init__()
        logger.info("Init URL Node")

    def _create_tables(self, con: sqlite3.Connection):
        """Create urls table if they don't exist."""
        con.execute("""
            CREATE TABLE IF NOT EXISTS urls (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                url_raw TEXT,
                tld TEXT,
                host TEXT,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)
        con.commit()

    def _process_data(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Extract all URLs from each post's text.

        Args:
            input_df (pd.DataFrame): posts with at least 'id' and 'text'
                columns.

        Returns:
            pd.DataFrame with one row per URL found, columns:
            post_id, url_raw, host, tld. Empty if nothing matched.
        """
        logger.info(f"Processing {len(input_df)} rows")
        mappings = []
        for _, post_row in input_df.iterrows():
            post_id = post_row['id']
            post_text = post_row['text']
            # Posts with NULL text arrive as None/NaN from SQLite; the
            # original code would raise TypeError in re.findall here.
            if not isinstance(post_text, str):
                logger.debug(f"Post {post_id} has no text, skipping")
                continue
            urls = self._URL_PATTERN.findall(post_text)
            logger.debug(f"Post {post_id}, text preview: {post_text[:50]}, URLs found: {len(urls)}")
            for url in urls:
                try:
                    parsed = urlparse(url)
                    # .hostname (unlike .netloc) strips any port and
                    # userinfo and lowercases — netloc gave e.g.
                    # 'example.com:8080' and a TLD of 'com:8080'.
                    hostname = parsed.hostname or ""
                    # If the hostname starts with www. remove that part.
                    if hostname.startswith('www.'):
                        hostname = hostname[4:]
                    # TLD is the last dot-separated label of the host.
                    tld = hostname.rsplit('.', 1)[-1] if hostname else ""
                    mappings.append({
                        'post_id': post_id,
                        'url_raw': url,
                        'host': hostname,
                        'tld': tld
                    })
                    logger.debug(f"  URL: {url} -> Host: {hostname}, TLD: {tld}")
                except Exception as e:
                    logger.warning(f"Failed to parse URL {url}: {e}")
        result_df = pd.DataFrame(mappings)
        logger.info(f"Extracted {len(result_df)} URLs from {len(input_df)} posts")
        return result_df

    def _store_results(self, con: sqlite3.Connection, result_df: pd.DataFrame):
        """Append the extracted URLs to the urls table (no-op if empty)."""
        if result_df.empty:
            logger.info("No URLs to store")
            return
        result_df.to_sql('urls', con, if_exists='append', index=False)
        logger.info(f"Stored {len(result_df)} URLs to database")

    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Executes the URL Node.

        Writes to a new table urls and creates said table if it does not
        exist currently.

        Args:
            con (sqlite3.Connection): SQLite database connection
            context (TransformContext): Transformcontext,
                containing the input dataframe of all posts

        Returns:
            TransformContext with processed dataframe.
        """
        logger.info("Starting URLNode transformation")
        input_df = context.get_dataframe()
        if input_df.empty:
            logger.warning("Empty dataframe. Skipping URLNode")
            return context
        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)
        logger.info("Node transformation complete")
        # Pass the original posts dataframe downstream unchanged.
        return TransformContext(input_df)
def main():
    """Run URLNode standalone against the local knack SQLite database.

    Sets up stdout logging, loads all posts, runs the URL extraction
    node, and closes the connection even on failure.
    """
    import sys
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    # Connect to database
    db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
    con = sqlite3.connect(db_path)
    try:
        # Read posts from database
        df = pd.read_sql('SELECT * FROM posts;', con)
        logger.info(f"Loaded {len(df)} posts from database")
        # Create context
        context = TransformContext(df)
        # Run URLNode (messages previously said "NerAuthorNode" —
        # copy-paste leftover from another node's main()).
        logger.info("Running URLNode...")
        node = URLNode()
        context = node.run(con, context)
        logger.info("URLNode complete")
        logger.info("URL node completed successfully!")
    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        con.close()


if __name__ == '__main__':
    main()