Makes the transformer script executable via the CLI

This commit is contained in:
quorploop 2026-01-27 20:19:05 +01:00
parent 8fae350b34
commit 7c2e34906e
11 changed files with 648 additions and 37 deletions

View file

@ -1,4 +1,5 @@
#! python3
import argparse
import logging
import os
import sqlite3
@ -23,9 +24,10 @@ logging.basicConfig(
logger = logging.getLogger("knack-transform")
def setup_database_connection(db_path=None):
    """Create a connection to the SQLite database.

    Args:
        db_path: Path to the SQLite database file. When ``None``, the
            ``DB_PATH`` environment variable is used, falling back to
            ``/data/knack.sqlite``.

    Returns:
        An open ``sqlite3.Connection`` to the resolved path.
    """
    # Consult the environment only when the caller gave no explicit path;
    # reading it unconditionally would silently discard the argument.
    if db_path is None:
        db_path = os.environ.get('DB_PATH', '/data/knack.sqlite')
    logger.info(f"Connecting to database: {db_path}")
    return sqlite3.connect(db_path)
@ -35,13 +37,12 @@ def table_exists(tablename: str, con: sqlite3.Connection):
query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
return len(con.execute(query, [tablename]).fetchall()) > 0
def main():
"""Main entry point for the transform pipeline."""
logger.info("Starting transform pipeline")
def run_from_database(db_path=None):
"""Run the pipeline using database as input and output."""
logger.info("Starting transform pipeline (database mode)")
try:
con = setup_database_connection()
con = setup_database_connection(db_path)
logger.info("Database connection established")
# Check if posts table exists
@ -73,8 +74,9 @@ def main():
max_workers = int(os.environ.get('MAX_WORKERS', 4))
pipeline = create_default_pipeline(device=device, max_workers=max_workers)
effective_db_path = db_path or os.environ.get('DB_PATH', '/data/knack.sqlite')
results = pipeline.run(
db_path=os.environ.get('DB_PATH', '/data/knack.sqlite'),
db_path=effective_db_path,
initial_context=context,
fail_fast=False # Continue even if some nodes fail
)
@ -97,6 +99,49 @@ def main():
con.close()
logger.info("Database connection closed")
def main():
    """Parse command-line arguments and run the transform pipeline.

    The pipeline always runs in database mode. ``--database`` is optional:
    when omitted, ``None`` is passed on so that ``run_from_database`` falls
    back to the ``DB_PATH`` environment variable or ``/data/knack.sqlite``.

    ``--device`` and ``--workers`` are exported to the ``COMPUTE_DEVICE`` and
    ``MAX_WORKERS`` environment variables before the pipeline starts, so the
    values given on the command line take effect.
    """
    parser = argparse.ArgumentParser(
        description='Transform pipeline for Knack scraper data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Run with database (Docker mode)
python main.py
# Run with custom device and workers
python main.py --database /path/to/knack.sqlite --device mps --workers 8
# Run with specific database file
python main.py --database /path/to/knack.sqlite
"""
    )
    parser.add_argument(
        '--database',
        help='Path to SQLite database (for database mode). Defaults to DB_PATH env var or /data/knack.sqlite'
    )
    parser.add_argument(
        '--device',
        default=os.environ.get('COMPUTE_DEVICE', 'cpu'),
        choices=['cpu', 'cuda', 'mps'],
        help='Device to use for compute-intensive operations (default: cpu)'
    )
    parser.add_argument(
        '--workers',
        type=int,
        default=int(os.environ.get('MAX_WORKERS', 4)),
        help='Maximum number of parallel workers (default: 4)'
    )
    args = parser.parse_args()

    # run_from_database() takes only a db_path and reads MAX_WORKERS from the
    # environment, so the parsed values are exported there; otherwise the
    # --workers flag would be accepted but silently ignored.
    os.environ['MAX_WORKERS'] = str(args.workers)
    # NOTE(review): the pipeline presumably reads the device from
    # COMPUTE_DEVICE as well (it is the source of this flag's default) -
    # confirm against run_from_database().
    os.environ['COMPUTE_DEVICE'] = args.device

    # Run unconditionally: with no --database, args.database is None and the
    # DB_PATH / default-path fallback applies. Gating on args.database would
    # make the documented bare `python main.py` invocation a no-op.
    run_from_database(db_path=args.database)
# Script entry guard: invoke the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
main()