initial commit

commit 7edf451e2e
Author: lukaszett

6 changed files with 200 additions and 0 deletions

.gitignore (3 additions, vendored, Normal file)
@@ -0,0 +1,3 @@
data/
venv/
.DS_STORE

Dockerfile (15 additions, Normal file)
@@ -0,0 +1,15 @@
FROM python:slim

RUN mkdir /app
RUN mkdir /data

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

RUN apt update -y
RUN apt install -y cron
COPY crontab .
RUN crontab crontab

COPY main.py .

Makefile (2 additions, Normal file)
@@ -0,0 +1,2 @@
build:
	docker build -t knack-scraper .
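
Note: the Makefile only wraps the image build; the commit includes neither a run target nor a CMD/ENTRYPOINT in the Dockerfile. A possible way to build the image and trigger a single scrape by hand is sketched below; the bind mount and the explicit python command are illustrative assumptions, not part of the commit:

# build the image via the Makefile target above
make build
# run one scrape pass; mounting ./data onto /data matches the
# DATABASE_LOCATION default in main.py, so the sqlite file persists on the host
mkdir -p data
docker run --rm -v "$(pwd)/data:/data" knack-scraper python /app/main.py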
							
								
								
									
crontab (1 addition, Normal file)
@@ -0,0 +1 @@
5 4 * * * python /app/main.py
							
								
								
									
main.py (165 additions, Executable file)
@@ -0,0 +1,165 @@
#! python
import locale
import logging
import os
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

logger = logging.getLogger("knack-scraper")
# ch = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# ch.setFormatter(formatter)
# ch.setLevel(logging.INFO)
# logger.addHandler(ch)


def table_exists(tablename: str, con: sqlite3.Connection):
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def download(id: int):
    if id == 0:
        return
    base_url = "https://knack.news/"
    url = f"{base_url}{id}"
    res = requests.get(url)

    # make sure we don't dos knack
    time.sleep(2)

    if not (200 <= res.status_code <= 300):
        return

    logger.info("Found promising page with id %d!", id)

    content = res.content
    soup = BeautifulSoup(content, "html.parser")
    date_format = "%d. %B %Y"

    # TODO FIXME: this fails inside the docker container
    locale.setlocale(locale.LC_TIME, "de_DE")
    pC = soup.find("div", {"class": "postContent"})

    if pC is None:
        # not a normal post
        logger.info(
            "Page with id %d does not have a .pageContent-div. Skipping for now.", id
        )
        return

    # every post has these fields
    title = pC.find("h3", {"class": "postTitle"}).text
    postText = pC.find("div", {"class": "postText"})

    # these fields are possible but not required
    # TODO: cleanup
    try:
        date_string = pC.find("span", {"class": "singledate"}).text
        parsed_date = datetime.strptime(date_string, date_format)
    except AttributeError:
        parsed_date = None

    try:
        author = pC.find("span", {"class": "author"}).text
    except AttributeError:
        author = None

    try:
        category = pC.find("span", {"class": "categoryInfo"}).find().text
    except AttributeError:
        category = None

    try:
        tags = [x.text for x in pC.find("div", {"class": "tagsInfo"}).find_all("a")]
    except AttributeError:
        tags = None

    img = pC.find("img", {"class": "postImage"})
    if img is not None:
        img = img["src"]

    res_dict = {
        "id": id,
        "title": title,
        "author": author,
        "date": parsed_date,
        "category": category,
        "url": url,
        "img_link": img,
        "tags": tags,
        "text": postText.text,
        "html": str(postText),
        "scraped_at": datetime.now(),
    }

    return res_dict


def run_downloads(min_id: int, max_id: int, num_threads: int = 8):
    res = []

    logger.info(
        "Started parallel scrape of posts from id %d to id %d using %d threads.",
        min_id,
        max_id - 1,
        num_threads,
    )
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Use a list comprehension to create a list of futures
        futures = [executor.submit(download, i) for i in range(min_id, max_id)]

        for future in tqdm.tqdm(
            futures, total=max_id - min_id
        ):  # tqdm to track progress
            post = future.result()
            if post is not None:
                res.append(post)

    # sqlite can't handle lists so let's convert them to a single row csv
    # TODO: make sure our database is properly normalized
    df = pd.DataFrame(res)
    df.tags = df.tags.apply(lambda x: "; ".join(x) if x is not None else None)

    return df


def main():
    num_threads = int(os.environ.get("NUM_THREADS", 8))
    n_scrapes = int(os.environ.get("NUM_SCRAPES", 100))
    database_location = os.environ.get("DATABASE_LOCATION", "/data/knack.sqlite")

    con = sqlite3.connect(database_location)
    with con:
        post_table_exists = table_exists("posts", con)

        if post_table_exists:
            logger.info("found posts retrieved earlier")
            # retrieve max post id from db so
            # we can skip retrieving known posts
            max_id_in_db = con.execute("SELECT MAX(id) FROM posts").fetchone()[0]
            logger.info("Got max id %d!", max_id_in_db)
        else:
            logger.info("no posts scraped so far - starting from 0")
            # retrieve from 0 onwards
            max_id_in_db = -1

    con = sqlite3.connect(database_location)
    df = run_downloads(
        min_id=max_id_in_db + 1,
        max_id=max_id_in_db + n_scrapes,
        num_threads=num_threads,
    )
    df.to_sql("posts", con, if_exists="append")


if __name__ == "__main__":
    main()
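
Note: main() appends each scraped batch to a posts table via df.to_sql. A quick way to spot-check the results from the host, assuming the database was bind-mounted as sketched above and the sqlite3 command-line client is installed (neither is part of this commit):

# show the five most recently scraped posts
sqlite3 data/knack.sqlite "SELECT id, date, title FROM posts ORDER BY id DESC LIMIT 5;"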
							
								
								
									
requirements.txt (14 additions, Normal file)
@@ -0,0 +1,14 @@
beautifulsoup4==4.12.2
certifi==2023.7.22
charset-normalizer==3.3.0
idna==3.4
numpy==1.26.1
pandas==2.1.1
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
six==1.16.0
soupsieve==2.5
tqdm==4.66.1
tzdata==2023.3
urllib3==2.0.7
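
Note: the data/ and venv/ entries in .gitignore suggest local runs outside Docker are expected. A minimal local setup sketch; the directory names come from .gitignore, and the DATABASE_LOCATION override points the sqlite file at a host-writable path instead of the container default /data/knack.sqlite:

# create and activate a virtual environment, then install pinned dependencies
python -m venv venv
. venv/bin/activate
pip install -r requirements.txt
# run one scrape pass against a local database file
mkdir -p data
DATABASE_LOCATION=data/knack.sqlite python main.py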