Add TransformNode to fuzzy-find author names

This commit is contained in:
quorploop 2025-12-23 17:53:37 +01:00
parent 64df8fb328
commit 72765532d3
11 changed files with 696 additions and 58 deletions

170
transform/example_node.py Normal file
View file

@@ -0,0 +1,170 @@
"""Example template node for the transform pipeline.
This is a template showing how to create new transform nodes.
Copy this file and modify it for your specific transformation needs.
"""
from pipeline import TransformContext
from transform_node import TransformNode
import sqlite3
import pandas as pd
import logging
logger = logging.getLogger("knack-transform")
class ExampleNode(TransformNode):
    """Example transform node template.

    Demonstrates the basic structure for creating new transformation
    nodes in the pipeline: optional table creation, dataframe
    processing, and optional result persistence, all orchestrated by
    ``run``.
    """

    def __init__(self,
                 param1: str = "default_value",
                 param2: int = 42,
                 device: str = "cpu"):
        """Initialize the ExampleNode.

        Args:
            param1: Example string parameter
            param2: Example integer parameter
            device: Device to use for computations ('cpu', 'cuda', 'mps')
        """
        self.param1 = param1
        self.param2 = param2
        self.device = device
        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logger.info("Initialized ExampleNode with param1=%s, param2=%s",
                    param1, param2)

    def _create_tables(self, con: sqlite3.Connection) -> None:
        """Create any necessary tables in the database.

        This is optional - only needed if your node creates new tables.

        Args:
            con: Open SQLite connection; the DDL is committed before returning.
        """
        logger.info("Creating example tables")
        con.execute("""
            CREATE TABLE IF NOT EXISTS example_results (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                result_value TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)
        con.commit()

    def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process the input dataframe.

        This is where your main transformation logic goes.

        Args:
            df: Input dataframe from context; must contain an 'id' column.

        Returns:
            A copy of *df* with 'processed' and 'example_value' columns added.
        """
        logger.info("Processing %d rows", len(df))
        # Work on a copy so the caller's dataframe is never mutated.
        result_df = df.copy()
        result_df['processed'] = True
        result_df['example_value'] = result_df['id'].apply(
            lambda x: f"{self.param1}_{x}")
        logger.info("Processing complete")
        return result_df

    def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame) -> None:
        """Store results back to the database.

        This is optional - only needed if you want to persist results.

        Args:
            con: Database connection
            df: Processed dataframe to store
        """
        if df.empty:
            logger.info("No results to store")
            return
        logger.info("Storing %d results", len(df))
        # Example: Store to database
        # df[['post_id', 'result_value']].to_sql(
        #     'example_results',
        #     con,
        #     if_exists='append',
        #     index=False
        # )
        con.commit()
        logger.info("Results stored successfully")

    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
        """Execute the transformation.

        This is the main entry point called by the pipeline.

        Args:
            con: SQLite database connection
            context: TransformContext containing input dataframe

        Returns:
            TransformContext with processed dataframe; the input context is
            returned unchanged when its dataframe is empty.
        """
        logger.info("Starting ExampleNode transformation")
        input_df = context.get_dataframe()
        # Guard clause: nothing to do on empty input.
        if input_df.empty:
            logger.warning("Empty dataframe provided to ExampleNode")
            return context
        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)
        logger.info("ExampleNode transformation complete")
        return TransformContext(result_df)
# Example usage:
if __name__ == "__main__":
    # This allows you to test your node independently.
    import os

    # Run relative to this file's own directory instead of a hard-coded,
    # machine-specific absolute path (which broke on any other machine).
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    from pipeline import TransformContext
    import sqlite3

    # Create test data
    test_df = pd.DataFrame({
        'id': [1, 2, 3],
        'author': ['Test Author 1', 'Test Author 2', 'Test Author 3']
    })

    # In-memory database: the smoke test leaves no files behind.
    test_con = sqlite3.connect(':memory:')
    try:
        # Create and run node
        node = ExampleNode(param1="test", param2=100)
        context = TransformContext(test_df)
        result_context = node.run(test_con, context)

        # Check results
        result_df = result_context.get_dataframe()
        print("\nResult DataFrame:")
        print(result_df)
    finally:
        # Close the connection even if the node raised.
        test_con.close()
    print("\n✓ ExampleNode test completed successfully!")