Implements feature to clean up the authors free-text field
This commit is contained in:
parent
bcd210ce01
commit
64df8fb328
14 changed files with 804 additions and 310 deletions
89
transform/main.py
Normal file
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""Transform pipeline: cleans up the free-text author field on scraped posts.

Configuration comes from the environment (optionally via a .env file):
DB_PATH, LOGGING_LEVEL, MAX_CLEANED_POSTS and COMPUTE_DEVICE.
"""
import logging
import os
import sqlite3
import sys

from dotenv import load_dotenv

load_dotenv()

if os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO':
    logging_level = logging.INFO
else:
    # Any other value (e.g. 'DEBUG') enables debug logging.
    logging_level = logging.DEBUG

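# Logs are written both to app.log (appended) and to stdout.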
logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("knack-transform")


def setup_database_connection():
    """Create a connection to the SQLite database."""
    db_path = os.environ.get('DB_PATH', '/data/knack.sqlite')
    logger.info(f"Connecting to database: {db_path}")
    return sqlite3.connect(db_path)


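# Note: sqlite3.connect() creates the file if it does not exist, so a brand-new
# database shows up as an empty schema rather than an error; table_exists()
# below is what guards main() against that case.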
def table_exists(tablename: str, con: sqlite3.Connection) -> bool:
    """Check whether a table exists in the database."""
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def main():
    """Main entry point for the transform pipeline."""
    logger.info("Starting transform pipeline")

    try:
        con = setup_database_connection()
        logger.info("Database connection established")

        # Check if posts table exists
        if not table_exists('posts', con):
            logger.warning("Posts table does not exist yet. Please run the scraper first to populate the database.")
            logger.info("Transform pipeline skipped - no data available")
            return

        # Import transform nodes lazily, so the early exits above don't pay
        # the cost of loading pandas and AuthorNode.
        from author_node import AuthorNode
        from base import TransformContext
        import pandas as pd

        # Load posts data
        logger.info("Loading posts from database")
        sql = "SELECT id, author FROM posts WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0) LIMIT ?"
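        # NOTE: is_cleaned is assumed to be set once a row has been processed
        # (presumably by AuthorNode), so repeated runs work through the backlog
        # in MAX_CLEANED_POSTS-sized batches.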
        # Environment variables come back as strings, so cast the limit to int.
        MAX_CLEANED_POSTS = int(os.environ.get("MAX_CLEANED_POSTS", 500))
        df = pd.read_sql(sql, con, params=[MAX_CLEANED_POSTS])
        logger.info(f"Loaded {len(df)} uncleaned posts with authors")

        if df.empty:
            logger.info("No uncleaned posts found. Transform pipeline skipped.")
            return

        # Create context and run author classification
        context = TransformContext(df)
        author_transform = AuthorNode(device=os.environ.get('COMPUTE_DEVICE', 'cpu'))  # Change to "cuda" or "mps" if available
        result_context = author_transform.run(con, context)
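        # result_context is not consumed yet; presumably the follow-up nodes in
        # the TODOs below would pick it up.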

        # TODO: Create a node to compute text embeddings and UMAP.
        # TODO: Create a node to pre-compute data for the visuals to reduce load time.

        logger.info("Transform pipeline completed successfully")

    except Exception as e:
        logger.error(f"Error in transform pipeline: {e}", exc_info=True)
        sys.exit(1)
    finally:
        # 'con' may never have been bound if setup_database_connection() raised.
        if 'con' in locals():
            con.close()
            logger.info("Database connection closed")


if __name__ == "__main__":
    main()