forked from lukaszett/Knack-Scraper
Makes transformer script executable via cli
This commit is contained in:
parent
8fae350b34
commit
7c2e34906e
11 changed files with 648 additions and 37 deletions
|
|
@ -1,4 +1,5 @@
|
|||
#! python3
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
|
|
@ -23,9 +24,10 @@ logging.basicConfig(
|
|||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
|
||||
def setup_database_connection(db_path=None):
    """Create a connection to the SQLite database.

    Args:
        db_path: Path to the SQLite file. When omitted (or empty), the
            ``DB_PATH`` environment variable is used, falling back to
            ``/data/knack.sqlite``.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for
        closing it.
    """
    # Only consult the environment when the caller gave no path; an
    # unconditional lookup here would silently discard the argument.
    # An empty string is treated like None so this resolves to the same
    # file as the `db_path or os.environ.get(...)` fallback used when the
    # pipeline is invoked.
    db_path = db_path or os.environ.get('DB_PATH', '/data/knack.sqlite')
    logger.info(f"Connecting to database: {db_path}")
    return sqlite3.connect(db_path)
|
||||
|
||||
|
|
@ -35,13 +37,12 @@ def table_exists(tablename: str, con: sqlite3.Connection):
|
|||
query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
|
||||
return len(con.execute(query, [tablename]).fetchall()) > 0
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point for the transform pipeline."""
|
||||
logger.info("Starting transform pipeline")
|
||||
def run_from_database(db_path=None):
|
||||
"""Run the pipeline using database as input and output."""
|
||||
logger.info("Starting transform pipeline (database mode)")
|
||||
|
||||
try:
|
||||
con = setup_database_connection()
|
||||
con = setup_database_connection(db_path)
|
||||
logger.info("Database connection established")
|
||||
|
||||
# Check if posts table exists
|
||||
|
|
@ -73,8 +74,9 @@ def main():
|
|||
max_workers = int(os.environ.get('MAX_WORKERS', 4))
|
||||
|
||||
pipeline = create_default_pipeline(device=device, max_workers=max_workers)
|
||||
effective_db_path = db_path or os.environ.get('DB_PATH', '/data/knack.sqlite')
|
||||
results = pipeline.run(
|
||||
db_path=os.environ.get('DB_PATH', '/data/knack.sqlite'),
|
||||
db_path=effective_db_path,
|
||||
initial_context=context,
|
||||
fail_fast=False # Continue even if some nodes fail
|
||||
)
|
||||
|
|
@ -97,6 +99,49 @@ def main():
|
|||
con.close()
|
||||
logger.info("Database connection closed")
|
||||
|
||||
def main():
    """Parse command-line arguments and run the transform pipeline.

    Database mode is the only mode: the pipeline always runs through
    ``run_from_database``. When ``--database`` is omitted, ``None`` is
    passed on and the path is resolved from the ``DB_PATH`` environment
    variable or ``/data/knack.sqlite``, so a bare ``python main.py``
    (the Docker invocation shown in the examples) still runs the pipeline.
    """
    parser = argparse.ArgumentParser(
        description='Transform pipeline for Knack scraper data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run with database (Docker mode)
  python main.py

  # Run with custom device and workers
  python main.py --database /path/to/knack.sqlite --device mps --workers 8

  # Run with specific database file
  python main.py --database /path/to/knack.sqlite
        """
    )

    parser.add_argument(
        '--database',
        help='Path to SQLite database (for database mode). Defaults to DB_PATH env var or /data/knack.sqlite'
    )
    parser.add_argument(
        '--device',
        default=os.environ.get('COMPUTE_DEVICE', 'cpu'),
        choices=['cpu', 'cuda', 'mps'],
        help='Device to use for compute-intensive operations (default: cpu)'
    )
    parser.add_argument(
        '--workers',
        type=int,
        default=int(os.environ.get('MAX_WORKERS', 4)),
        help='Maximum number of parallel workers (default: 4)'
    )

    args = parser.parse_args()

    # run_from_database() only accepts db_path and reads MAX_WORKERS from
    # the environment, so publish the CLI choices there; otherwise
    # --device/--workers would be parsed and then ignored.
    # NOTE(review): assumes run_from_database() reads COMPUTE_DEVICE the
    # same way it reads MAX_WORKERS - confirm.
    os.environ['COMPUTE_DEVICE'] = args.device
    os.environ['MAX_WORKERS'] = str(args.workers)

    # Run unconditionally: args.database is None when the flag is omitted,
    # which lets run_from_database() apply the documented default path.
    # The "Database connection closed" message is logged where the
    # connection is actually closed, so it is not repeated here.
    run_from_database(db_path=args.database)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue