Makes transformer script executable via CLI

This commit is contained in:
quorploop 2026-01-27 20:19:05 +01:00
parent 8fae350b34
commit 7c2e34906e
11 changed files with 648 additions and 37 deletions

View file

@ -418,3 +418,52 @@ class FuzzyAuthorNode(TransformNode):
# Return new context with results
return TransformContext(input_df)
def main(argv=None) -> None:
    """Run the author transform nodes against the posts database.

    Loads every row of the ``posts`` table into a DataFrame, wraps it in a
    ``TransformContext`` and runs ``NerAuthorNode`` followed by
    ``FuzzyAuthorNode`` on it, sharing one SQLite connection between them.

    All settings are exposed as command-line options whose defaults are the
    values that were previously hard-coded, so calling ``main()`` with no
    CLI arguments behaves as before.

    Args:
        argv: Command-line arguments to parse, without the program name.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``.

    Raises:
        FileNotFoundError: If the database file does not exist.
        Exception: Any error raised while loading or transforming the data
            is logged with its traceback and re-raised.
    """
    import argparse
    import sys
    from contextlib import closing
    from pathlib import Path

    parser = argparse.ArgumentParser(
        description="Run the author transform nodes on the posts table.",
    )
    parser.add_argument(
        "--db-path",
        default="/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite",
        help="Path to the SQLite database (default: %(default)s).",
    )
    parser.add_argument(
        "--device",
        default="mps",
        help="Device passed to NerAuthorNode (default: %(default)s).",
    )
    parser.add_argument(
        "--max-l-dist",
        type=int,
        default=1,
        help="max_l_dist passed to FuzzyAuthorNode (default: %(default)s).",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
        ],
    )
    logger = logging.getLogger("knack-transform")

    # sqlite3.connect() silently creates an empty database when the file is
    # missing, which would only surface later as a "no such table" error and
    # leave a stray file behind. Fail early with a clear message instead.
    db_path = Path(args.db_path)
    if not db_path.is_file():
        raise FileNotFoundError(f"Database file not found: {db_path}")

    # closing() guarantees the connection is closed on every exit path.
    with closing(sqlite3.connect(str(db_path))) as con:
        try:
            # Read posts from database
            df = pd.read_sql('SELECT * FROM posts;', con)
            logger.info("Loaded %d posts from database", len(df))

            # Each node receives the context returned by the previous one.
            context = TransformContext(df)

            logger.info("Running NerAuthorNode...")
            ner_node = NerAuthorNode(device=args.device)
            context = ner_node.run(con, context)
            logger.info("NerAuthorNode complete")

            logger.info("Running FuzzyAuthorNode...")
            fuzzy_node = FuzzyAuthorNode(max_l_dist=args.max_l_dist)
            context = fuzzy_node.run(con, context)
            logger.info("FuzzyAuthorNode complete")

            logger.info("All author nodes completed successfully!")
        except Exception as e:
            # Log with traceback, then propagate so the process exits non-zero.
            logger.exception("Error during transformation: %s", e)
            raise
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()