# forked from lukaszett/Knack-Scraper
"""Example template node for the transform pipeline.

This is a template showing how to create new transform nodes.
Copy this file and modify it for your specific transformation needs.
"""

# Imports grouped stdlib / third-party / local per PEP 8.
import logging
import sqlite3

import pandas as pd

from pipeline import TransformContext
from transform_node import TransformNode

# Shared pipeline-wide logger name so all transform nodes log to one channel.
logger = logging.getLogger("knack-transform")
class ExampleNode(TransformNode):
    """Example transform node template.

    This node demonstrates the basic structure for creating
    new transformation nodes in the pipeline: optional table
    creation, a pure dataframe transformation, optional result
    persistence, and a ``run`` entry point called by the pipeline.
    """

    def __init__(self,
                 param1: str = "default_value",
                 param2: int = 42,
                 device: str = "cpu"):
        """Initialize the ExampleNode.

        Args:
            param1: Example string parameter (used as a prefix in
                ``_process_data``).
            param2: Example integer parameter.
            device: Device to use for computations ('cpu', 'cuda', 'mps').
        """
        # NOTE(review): matching the original template, TransformNode.__init__
        # is not called here — confirm the base class needs no initialization.
        self.param1 = param1
        self.param2 = param2
        self.device = device
        # Lazy %-style args defer formatting until the record is emitted.
        logger.info("Initialized ExampleNode with param1=%s, param2=%s",
                    param1, param2)

    def _create_tables(self, con: sqlite3.Connection) -> None:
        """Create any necessary tables in the database.

        This is optional - only needed if your node creates new tables.

        Args:
            con: Open SQLite connection; the DDL is committed before returning.
        """
        logger.info("Creating example tables")
        # IF NOT EXISTS makes repeated pipeline runs idempotent.
        con.execute("""
            CREATE TABLE IF NOT EXISTS example_results (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                result_value TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)
        con.commit()

    def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process the input dataframe.

        This is where your main transformation logic goes.

        Args:
            df: Input dataframe from context.

        Returns:
            Processed dataframe (a copy; the input is not mutated).
        """
        logger.info("Processing %d rows", len(df))

        # Example: add new columns based on existing data. Copy first so the
        # caller's frame is left untouched.
        result_df = df.copy()
        result_df['processed'] = True
        # Assumes the input frame has an 'id' column — TODO confirm the
        # upstream schema provides it.
        result_df['example_value'] = result_df['id'].apply(
            lambda x: f"{self.param1}_{x}")

        logger.info("Processing complete")
        return result_df

    def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame) -> None:
        """Store results back to the database.

        This is optional - only needed if you want to persist results.

        Args:
            con: Database connection.
            df: Processed dataframe to store; an empty frame is a no-op.
        """
        if df.empty:
            logger.info("No results to store")
            return

        logger.info("Storing %d results", len(df))

        # Example: Store to database
        # df[['post_id', 'result_value']].to_sql(
        #     'example_results',
        #     con,
        #     if_exists='append',
        #     index=False
        # )

        con.commit()
        logger.info("Results stored successfully")

    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
        """Execute the transformation.

        This is the main entry point called by the pipeline.

        Args:
            con: SQLite database connection.
            context: TransformContext containing the input dataframe.

        Returns:
            TransformContext with the processed dataframe; the input
            context is returned unchanged when its dataframe is empty.
        """
        logger.info("Starting ExampleNode transformation")

        input_df = context.get_dataframe()

        # Guard clause: nothing to do on empty input.
        if input_df.empty:
            logger.warning("Empty dataframe provided to ExampleNode")
            return context

        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)

        logger.info("ExampleNode transformation complete")
        return TransformContext(result_df)
# Example usage:
if __name__ == "__main__":
    # This allows you to test your node independently.
    # NOTE(review): the original hard-coded a user-specific absolute path in
    # os.chdir; resolving relative to this file makes the smoke test portable.
    import os
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    # Create test data
    test_df = pd.DataFrame({
        'id': [1, 2, 3],
        'author': ['Test Author 1', 'Test Author 2', 'Test Author 3'],
    })

    # In-memory database keeps the smoke test self-contained.
    test_con = sqlite3.connect(':memory:')

    # Create and run node
    node = ExampleNode(param1="test", param2=100)
    context = TransformContext(test_df)
    result_context = node.run(test_con, context)

    # Check results
    result_df = result_context.get_dataframe()
    print("\nResult DataFrame:")
    print(result_df)

    test_con.close()
    print("\n✓ ExampleNode test completed successfully!")