forked from lukaszett/Knack-Scraper
Adds TransformNode to FuzzyFind Author Names
This commit is contained in:
parent
64df8fb328
commit
72765532d3
11 changed files with 696 additions and 58 deletions
170
transform/example_node.py
Normal file
170
transform/example_node.py
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
"""Example template node for the transform pipeline.
|
||||
|
||||
This is a template showing how to create new transform nodes.
|
||||
Copy this file and modify it for your specific transformation needs.
|
||||
"""
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
|
||||
class ExampleNode(TransformNode):
    """Example transform node template.

    This node demonstrates the basic structure for creating
    new transformation nodes in the pipeline: copy this class and adapt
    the three private helpers (``_create_tables``, ``_process_data``,
    ``_store_results``) for your own transformation. ``run`` is the
    entry point invoked by the pipeline.
    """

    def __init__(self,
                 param1: str = "default_value",
                 param2: int = 42,
                 device: str = "cpu"):
        """Initialize the ExampleNode.

        Args:
            param1: Example string parameter
            param2: Example integer parameter
            device: Device to use for computations ('cpu', 'cuda', 'mps')
        """
        self.param1 = param1
        self.param2 = param2
        # NOTE(review): device is stored but never read in this template;
        # it is kept so derived nodes with torch backends share the signature.
        self.device = device
        # Lazy %-style args: formatting is deferred until the record is
        # actually emitted (same output text as the previous f-string).
        logger.info("Initialized ExampleNode with param1=%s, param2=%s",
                    param1, param2)

    def _create_tables(self, con: sqlite3.Connection) -> None:
        """Create any necessary tables in the database.

        This is optional - only needed if your node creates new tables.
        Uses ``IF NOT EXISTS`` so repeated pipeline runs are idempotent.

        Args:
            con: Open SQLite connection to create the table on.
        """
        logger.info("Creating example tables")

        con.execute("""
            CREATE TABLE IF NOT EXISTS example_results (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                result_value TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)

        con.commit()

    def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process the input dataframe.

        This is where your main transformation logic goes.

        Args:
            df: Input dataframe from context. Must contain an ``id``
                column (used below to derive ``example_value``).

        Returns:
            A copy of ``df`` with ``processed`` and ``example_value``
            columns added; the input dataframe is not mutated.
        """
        logger.info("Processing %d rows", len(df))

        # Example: Add a new column based on existing data.
        # Work on a copy so the caller's dataframe is left untouched.
        result_df = df.copy()
        result_df['processed'] = True
        result_df['example_value'] = result_df['id'].apply(
            lambda x: f"{self.param1}_{x}")

        logger.info("Processing complete")
        return result_df

    def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame) -> None:
        """Store results back to the database.

        This is optional - only needed if you want to persist results.

        NOTE: in this template the actual write is commented out, so the
        commit below persists nothing; uncomment and adapt the ``to_sql``
        call in a real node.

        Args:
            con: Database connection
            df: Processed dataframe to store
        """
        if df.empty:
            logger.info("No results to store")
            return

        logger.info("Storing %d results", len(df))

        # Example: Store to database
        # df[['post_id', 'result_value']].to_sql(
        #     'example_results',
        #     con,
        #     if_exists='append',
        #     index=False
        # )

        con.commit()
        logger.info("Results stored successfully")

    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
        """Execute the transformation.

        This is the main entry point called by the pipeline.

        Args:
            con: SQLite database connection
            context: TransformContext containing input dataframe

        Returns:
            TransformContext with processed dataframe, or the unchanged
            input context when the input dataframe is empty.
        """
        logger.info("Starting ExampleNode transformation")

        # Get input dataframe from context
        input_df = context.get_dataframe()

        # Guard clause: nothing to do on an empty frame — return the
        # original context untouched rather than an empty copy.
        if input_df.empty:
            logger.warning("Empty dataframe provided to ExampleNode")
            return context

        # Create any necessary tables
        self._create_tables(con)

        # Process the data
        result_df = self._process_data(input_df)

        # Store results (optional)
        self._store_results(con, result_df)

        logger.info("ExampleNode transformation complete")

        # Return new context with results
        return TransformContext(result_df)
|
||||
|
||||
|
||||
# Example usage:
if __name__ == "__main__":
    # This allows you to test your node independently.
    # (The previous version chdir'd to a developer-specific absolute path;
    # that was machine-dependent and unnecessary — the project imports at
    # the top of this file already resolved at module load time.)

    # Create test data
    test_df = pd.DataFrame({
        'id': [1, 2, 3],
        'author': ['Test Author 1', 'Test Author 2', 'Test Author 3']
    })

    # In-memory database: the smoke test leaves no files behind
    test_con = sqlite3.connect(':memory:')
    try:
        # Create and run node
        node = ExampleNode(param1="test", param2=100)
        context = TransformContext(test_df)
        result_context = node.run(test_con, context)

        # Check results
        result_df = result_context.get_dataframe()
        print("\nResult DataFrame:")
        print(result_df)
    finally:
        # Close the connection even if the node raises
        test_con.close()

    print("\n✓ ExampleNode test completed successfully!")
|
||||
Loading…
Add table
Add a link
Reference in a new issue