forked from lukaszett/Knack-Scraper
Makes transformer script executable via cli
This commit is contained in:
parent
8fae350b34
commit
7c2e34906e
11 changed files with 648 additions and 37 deletions
160
transform/url_node.py
Normal file
@@ -0,0 +1,160 @@
"""Nodes to extract URL in text using regex patterns."""
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
class URLNode(TransformNode):
    """Node that looks for URLs in the text column of posts.

    Stores data in a new table urls:
    - id, post_id, url_raw, tld, host
    """

    def __init__(self):
        super().__init__()
        logger.info("Init URL Node")

    def _create_tables(self, con: sqlite3.Connection):
        """Create the urls table if it doesn't exist."""
        con.execute("""
            CREATE TABLE IF NOT EXISTS urls (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                url_raw TEXT,
                tld TEXT,
                host TEXT,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)
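        # A stored row then looks like (illustrative values only):
        #   (1, 42, 'https://www.example.com/a', 'com', 'example.com')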

        con.commit()

    def _process_data(self, input_df: pd.DataFrame) -> pd.DataFrame:
        logger.info(f"Processing {len(input_df)} rows")

        mappings = []
        for _, post_row in input_df.iterrows():
            post_id = post_row['id']
            post_text = post_row['text']

            # Matches http(s) URLs: scheme, optional "www.", a dotted
            # host, and an optional path/query tail.
            pattern = r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9@:%_\+.~#?&/=]*"
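            # Example (illustrative): for the text
            #   "see https://www.example.com/a?x=1 today"
            # re.findall returns ['https://www.example.com/a?x=1'];
            # matching stops at the first character outside the
            # allowed URL character classes (here, the space).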

            urls = re.findall(pattern, post_text)
            logger.debug(f"Post {post_id}, text preview: {post_text[:50]}, URLs found: {len(urls)}")

            for url in urls:
                try:
                    parsed = urlparse(url)
                    hostname = parsed.netloc

                    # If the hostname starts with "www.", strip that prefix.
                    if hostname.startswith('www.'):
                        hostname = hostname[4:]

                    # Extract the TLD (the label after the last dot).
                    tld = ""
                    if hostname:
                        tld = hostname.split('.')[-1]
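                    # Note: only the last label is taken, with no
                    # public-suffix handling; "https://sub.example.co.uk/p"
                    # yields host='sub.example.co.uk' and tld='uk'.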

                    mappings.append({
                        'post_id': post_id,
                        'url_raw': url,
                        'host': hostname,
                        'tld': tld
                    })
                    logger.debug(f"  URL: {url} -> Host: {hostname}, TLD: {tld}")
                except Exception as e:
                    logger.warning(f"Failed to parse URL {url}: {e}")

        result_df = pd.DataFrame(mappings)
        logger.info(f"Extracted {len(result_df)} URLs from {len(input_df)} posts")
        return result_df

    def _store_results(self, con: sqlite3.Connection, result_df: pd.DataFrame):
        if result_df.empty:
            logger.info("No URLs to store")
            return

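        # Appends only the four data columns; SQLite assigns the
        # AUTOINCREMENT id for each inserted row.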
        result_df.to_sql('urls', con, if_exists='append', index=False)
        logger.info(f"Stored {len(result_df)} URLs to database")

    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Executes the URL Node.

        Writes to a new table urls, creating that table if it does not
        yet exist.

        Args:
            con (sqlite3.Connection): SQLite database connection.
            context (TransformContext): TransformContext containing the
                input dataframe of all posts.

        Returns:
            TransformContext wrapping the input dataframe; the extracted
            URLs are stored in the urls table rather than returned.
        """
        logger.info("Starting URLNode transformation")

        input_df = context.get_dataframe()

        if input_df.empty:
            logger.warning("Empty dataframe. Skipping URLNode")
            return context

        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)

        logger.info("Node transformation complete")

        return TransformContext(input_df)

def main():
    import sys

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger("knack-transform")

    # Connect to database
    db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
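    # NOTE: machine-specific absolute path; point it at your local copy
    # of the scraper database.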
    con = sqlite3.connect(db_path)

    try:
        # Read posts from database
        df = pd.read_sql('SELECT * FROM posts;', con)
        logger.info(f"Loaded {len(df)} posts from database")

        # Create context
        context = TransformContext(df)

        # Run URLNode
        logger.info("Running URLNode...")
        node = URLNode()
        context = node.run(con, context)
        logger.info("URLNode complete")

        logger.info("URL transformation completed successfully!")

    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        con.close()
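

# Entry point so the transformer can be invoked from the command line,
# e.g.: python transform/url_node.py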
if __name__ == '__main__':
    main()