# forked from lukaszett/Knack-Scraper
"""Example template node for the transform pipeline.

This is a template showing how to create new transform nodes.
Copy this file and modify it for your specific transformation needs.
"""

# Imports grouped stdlib / third-party / local per PEP 8.
import logging
import sqlite3

import pandas as pd

from pipeline import TransformContext
from transform_node import TransformNode

# Shared pipeline-wide logger name so all transform nodes log to one channel.
logger = logging.getLogger("knack-transform")
class ExampleNode(TransformNode):
    """Example transform node template.

    This node demonstrates the basic structure for creating
    new transformation nodes in the pipeline: optional table
    creation, a pure dataframe transformation, optional result
    persistence, and a ``run`` entry point called by the pipeline.
    """

    def __init__(self,
                 param1: str = "default_value",
                 param2: int = 42,
                 device: str = "cpu"):
        """Initialize the ExampleNode.

        Args:
            param1: Example string parameter (used as a prefix in
                ``_process_data``).
            param2: Example integer parameter.
            device: Device to use for computations ('cpu', 'cuda', 'mps').
        """
        # NOTE(review): matching the original template, TransformNode.__init__
        # is not called here — confirm the base class needs no initialization.
        self.param1 = param1
        self.param2 = param2
        self.device = device
        # Lazy %-style args defer formatting until the record is emitted.
        logger.info("Initialized ExampleNode with param1=%s, param2=%s",
                    param1, param2)

    def _create_tables(self, con: sqlite3.Connection) -> None:
        """Create any necessary tables in the database.

        This is optional - only needed if your node creates new tables.

        Args:
            con: Open SQLite connection; the DDL is committed before returning.
        """
        logger.info("Creating example tables")
        # IF NOT EXISTS makes repeated pipeline runs idempotent.
        con.execute("""
            CREATE TABLE IF NOT EXISTS example_results (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                result_value TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)
        con.commit()

    def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process the input dataframe.

        This is where your main transformation logic goes.

        Args:
            df: Input dataframe from context.

        Returns:
            Processed dataframe (a copy; the input is not mutated).
        """
        logger.info("Processing %d rows", len(df))

        # Example: add new columns based on existing data. Copy first so the
        # caller's frame is left untouched.
        result_df = df.copy()
        result_df['processed'] = True
        # Assumes the input frame has an 'id' column — TODO confirm the
        # upstream schema provides it.
        result_df['example_value'] = result_df['id'].apply(
            lambda x: f"{self.param1}_{x}")

        logger.info("Processing complete")
        return result_df

    def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame) -> None:
        """Store results back to the database.

        This is optional - only needed if you want to persist results.

        Args:
            con: Database connection.
            df: Processed dataframe to store; an empty frame is a no-op.
        """
        if df.empty:
            logger.info("No results to store")
            return

        logger.info("Storing %d results", len(df))

        # Example: Store to database
        # df[['post_id', 'result_value']].to_sql(
        #     'example_results',
        #     con,
        #     if_exists='append',
        #     index=False
        # )

        con.commit()
        logger.info("Results stored successfully")

    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
        """Execute the transformation.

        This is the main entry point called by the pipeline.

        Args:
            con: SQLite database connection.
            context: TransformContext containing the input dataframe.

        Returns:
            TransformContext with the processed dataframe; the input
            context is returned unchanged when its dataframe is empty.
        """
        logger.info("Starting ExampleNode transformation")

        input_df = context.get_dataframe()

        # Guard clause: nothing to do on empty input.
        if input_df.empty:
            logger.warning("Empty dataframe provided to ExampleNode")
            return context

        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)

        logger.info("ExampleNode transformation complete")
        return TransformContext(result_df)
# Example usage:
if __name__ == "__main__":
    # This allows you to test your node independently.
    # NOTE(review): the original hard-coded a user-specific absolute path in
    # os.chdir; resolving relative to this file makes the smoke test portable.
    import os
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    # Create test data
    test_df = pd.DataFrame({
        'id': [1, 2, 3],
        'author': ['Test Author 1', 'Test Author 2', 'Test Author 3'],
    })

    # In-memory database keeps the smoke test self-contained.
    test_con = sqlite3.connect(':memory:')

    # Create and run node
    node = ExampleNode(param1="test", param2=100)
    context = TransformContext(test_df)
    result_context = node.run(test_con, context)

    # Check results
    result_df = result_context.get_dataframe()
    print("\nResult DataFrame:")
    print(result_df)

    test_con.close()
    print("\n✓ ExampleNode test completed successfully!")