"""Example template node for the transform pipeline. This is a template showing how to create new transform nodes. Copy this file and modify it for your specific transformation needs. """ from pipeline import TransformContext from transform_node import TransformNode import sqlite3 import pandas as pd import logging logger = logging.getLogger("knack-transform") class ExampleNode(TransformNode): """Example transform node template. This node demonstrates the basic structure for creating new transformation nodes in the pipeline. """ def __init__(self, param1: str = "default_value", param2: int = 42, device: str = "cpu"): """Initialize the ExampleNode. Args: param1: Example string parameter param2: Example integer parameter device: Device to use for computations ('cpu', 'cuda', 'mps') """ self.param1 = param1 self.param2 = param2 self.device = device logger.info(f"Initialized ExampleNode with param1={param1}, param2={param2}") def _create_tables(self, con: sqlite3.Connection): """Create any necessary tables in the database. This is optional - only needed if your node creates new tables. """ logger.info("Creating example tables") con.execute(""" CREATE TABLE IF NOT EXISTS example_results ( id INTEGER PRIMARY KEY AUTOINCREMENT, post_id INTEGER, result_value TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (post_id) REFERENCES posts(id) ) """) con.commit() def _process_data(self, df: pd.DataFrame) -> pd.DataFrame: """Process the input dataframe. This is where your main transformation logic goes. Args: df: Input dataframe from context Returns: Processed dataframe """ logger.info(f"Processing {len(df)} rows") # Example: Add a new column based on existing data result_df = df.copy() result_df['processed'] = True result_df['example_value'] = result_df['id'].apply(lambda x: f"{self.param1}_{x}") logger.info("Processing complete") return result_df def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame): """Store results back to the database. This is optional - only needed if you want to persist results. Args: con: Database connection df: Processed dataframe to store """ if df.empty: logger.info("No results to store") return logger.info(f"Storing {len(df)} results") # Example: Store to database # df[['post_id', 'result_value']].to_sql( # 'example_results', # con, # if_exists='append', # index=False # ) con.commit() logger.info("Results stored successfully") def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext: """Execute the transformation. This is the main entry point called by the pipeline. Args: con: SQLite database connection context: TransformContext containing input dataframe Returns: TransformContext with processed dataframe """ logger.info("Starting ExampleNode transformation") # Get input dataframe from context input_df = context.get_dataframe() # Validate input if input_df.empty: logger.warning("Empty dataframe provided to ExampleNode") return context # Create any necessary tables self._create_tables(con) # Process the data result_df = self._process_data(input_df) # Store results (optional) self._store_results(con, result_df) logger.info("ExampleNode transformation complete") # Return new context with results return TransformContext(result_df) # Example usage: if __name__ == "__main__": # This allows you to test your node independently import os os.chdir('/Users/linussilberstein/Documents/Knack-Scraper/transform') from pipeline import TransformContext import sqlite3 # Create test data test_df = pd.DataFrame({ 'id': [1, 2, 3], 'author': ['Test Author 1', 'Test Author 2', 'Test Author 3'] }) # Create test database connection test_con = sqlite3.connect(':memory:') # Create and run node node = ExampleNode(param1="test", param2=100) context = TransformContext(test_df) result_context = node.run(test_con, context) # Check results result_df = result_context.get_dataframe() print("\nResult DataFrame:") print(result_df) test_con.close() print("\n✓ ExampleNode test completed successfully!")