Add TransformNode to fuzzy-find author names

This commit is contained in:
quorploop 2025-12-23 17:53:37 +01:00
parent 64df8fb328
commit 72765532d3
11 changed files with 696 additions and 58 deletions

170
transform/example_node.py Normal file
View file

@@ -0,0 +1,170 @@
"""Example template node for the transform pipeline.
This is a template showing how to create new transform nodes.
Copy this file and modify it for your specific transformation needs.
"""
from pipeline import TransformContext
from transform_node import TransformNode
import sqlite3
import pandas as pd
import logging
logger = logging.getLogger("knack-transform")
class ExampleNode(TransformNode):
    """Example transform node template.

    Demonstrates the basic structure for creating new transformation
    nodes in the pipeline: optional table creation, dataframe
    processing, and optional result persistence, all orchestrated by
    ``run``.
    """

    def __init__(self,
                 param1: str = "default_value",
                 param2: int = 42,
                 device: str = "cpu"):
        """Initialize the ExampleNode.

        Args:
            param1: Example string parameter
            param2: Example integer parameter
            device: Device to use for computations ('cpu', 'cuda', 'mps')
        """
        self.param1 = param1
        self.param2 = param2
        self.device = device
        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logger.info("Initialized ExampleNode with param1=%s, param2=%s",
                    param1, param2)

    def _create_tables(self, con: sqlite3.Connection) -> None:
        """Create any necessary tables in the database.

        This is optional - only needed if your node creates new tables.

        Args:
            con: Open SQLite connection; the DDL is committed before returning.
        """
        logger.info("Creating example tables")
        con.execute("""
            CREATE TABLE IF NOT EXISTS example_results (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                result_value TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)
        con.commit()

    def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process the input dataframe.

        This is where your main transformation logic goes.

        Args:
            df: Input dataframe from context; must contain an 'id' column.

        Returns:
            A copy of *df* with 'processed' and 'example_value' columns added.
        """
        logger.info("Processing %d rows", len(df))
        # Work on a copy so the caller's dataframe is never mutated.
        result_df = df.copy()
        result_df['processed'] = True
        result_df['example_value'] = result_df['id'].apply(
            lambda x: f"{self.param1}_{x}")
        logger.info("Processing complete")
        return result_df

    def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame) -> None:
        """Store results back to the database.

        This is optional - only needed if you want to persist results.

        Args:
            con: Database connection
            df: Processed dataframe to store
        """
        if df.empty:
            logger.info("No results to store")
            return
        logger.info("Storing %d results", len(df))
        # Example: Store to database
        # df[['post_id', 'result_value']].to_sql(
        #     'example_results',
        #     con,
        #     if_exists='append',
        #     index=False
        # )
        con.commit()
        logger.info("Results stored successfully")

    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
        """Execute the transformation.

        This is the main entry point called by the pipeline.

        Args:
            con: SQLite database connection
            context: TransformContext containing input dataframe

        Returns:
            TransformContext with processed dataframe; the input context is
            returned unchanged when its dataframe is empty.
        """
        logger.info("Starting ExampleNode transformation")
        input_df = context.get_dataframe()
        # Guard clause: nothing to do on empty input.
        if input_df.empty:
            logger.warning("Empty dataframe provided to ExampleNode")
            return context
        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)
        logger.info("ExampleNode transformation complete")
        return TransformContext(result_df)
# Example usage:
if __name__ == "__main__":
    # This allows you to test your node independently.
    import os

    # Run relative to this file's own directory instead of a hard-coded,
    # machine-specific absolute path (which broke on any other machine).
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    from pipeline import TransformContext
    import sqlite3

    # Create test data
    test_df = pd.DataFrame({
        'id': [1, 2, 3],
        'author': ['Test Author 1', 'Test Author 2', 'Test Author 3']
    })

    # In-memory database: the smoke test leaves no files behind.
    test_con = sqlite3.connect(':memory:')
    try:
        # Create and run node
        node = ExampleNode(param1="test", param2=100)
        context = TransformContext(test_df)
        result_context = node.run(test_con, context)

        # Check results
        result_df = result_context.get_dataframe()
        print("\nResult DataFrame:")
        print(result_df)
    finally:
        # Close the connection even if the node raised.
        test_con.close()
    print("\n✓ ExampleNode test completed successfully!")