diff --git a/.gitignore b/.gitignore
index 8e2fba4..52e71df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 data/
 venv/
 experiment/
+__pycache__/
 .DS_STORE
 .env
\ No newline at end of file
diff --git a/transform/Dockerfile b/transform/Dockerfile
index 6f148bd..dd847a2 100644
--- a/transform/Dockerfile
+++ b/transform/Dockerfile
@@ -17,8 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 ENV GLINER_MODEL_ID=urchade/gliner_multi-v2.1
 ENV GLINER_MODEL_PATH=/models/gliner_multi-v2.1
-ENV MINILM_MODEL_ID=sentence-transformers/all-MiniLM-L6-v2
-ENV MINILM_MODEL_PATH=/models/all-MiniLM-L6-v2
+ENV GTE_MODEL_ID=thenlper/gte-large
+ENV GTE_MODEL_PATH=/models/thenlper/gte-large

 WORKDIR /app
 COPY requirements.txt .
@@ -31,16 +31,16 @@
 RUN apt install -y cron locales

 # Ensure GLiNER helper scripts are available
 COPY ensure_gliner_model.sh /usr/local/bin/ensure_gliner_model.sh
-# Ensure MiniLM helper scripts are available
-COPY ensure_minilm_model.sh /usr/local/bin/ensure_minilm_model.sh
+# Ensure GTE helper scripts are available
+COPY ensure_gte_model.sh /usr/local/bin/ensure_gte_model.sh
 COPY entrypoint.sh /usr/local/bin/entrypoint.sh
-RUN chmod +x /usr/local/bin/ensure_gliner_model.sh /usr/local/bin/ensure_minilm_model.sh /usr/local/bin/entrypoint.sh
+RUN chmod +x /usr/local/bin/ensure_gliner_model.sh /usr/local/bin/ensure_gte_model.sh /usr/local/bin/entrypoint.sh

 COPY *.py .

 # Create cron job that runs every weekend (Sunday at 3 AM) 0 3 * * 0
-# Testing every 30 Minutes */30 * * * *
-RUN echo "*/30 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform
+# Testing every 15 minutes: */15 * * * *
+RUN echo "*/15 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform
 RUN chmod 0644 /etc/cron.d/knack-transform
 RUN crontab /etc/cron.d/knack-transform
diff --git a/transform/app.log b/transform/app.log
new file mode 100644
index 0000000..551ef70
--- /dev/null
+++ b/transform/app.log
@@ -0,0 +1,303 @@
+2026-01-18 15:11:40,253 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
+2026-01-18 15:11:40,254 - knack-transform - INFO - index id title author ... embedding umap_x umap_y row
+0 0 41 Über uns None ... 0 0.0 0.0 0.0
+1 1 52 Kontakt None ... 0 0.0 0.0 0.0
+2 2 99 Safety First None ... 0 0.0 0.0 0.0
+3 3 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... chakalaka_161 ... 0 0.0 0.0 0.0
+4 4 115 Feuriger Widerstand bei der Räumung der Tiefe ... anonym ... 0 0.0 0.0 0.0
+.. ... ... ... ... ... ... ... ... ...
+95 10 643 Bericht vom 6. Prozesstag im Antifa-Ost Verfah... Soli Antifa Ost ... 0 0.0 0.0 0.0
+96 11 650 #le2310 // Aufruf Ost // Kein Freund – Kein He... anonym ... 0 0.0 0.0 0.0
+97 12 652 Aufruf: Am 23. Oktober von Hamburg nach Leipzi... anonym ... 0 0.0 0.0 0.0
+98 13 654 Nach der Demo ging’s bergab kreuzer online ... 0 0.0 0.0 0.0
+99 14 659 Polizistin unterhält romantische Brieffreundsc... Kira Ayyadi ... 
0 0.0 0.0 0.0 + +[100 rows x 17 columns] +2026-01-18 15:11:40,271 - knack-transform - INFO - Starting TextEmbeddingNode transformation +2026-01-18 15:11:40,271 - knack-transform - INFO - Processing 100 rows +2026-01-18 15:11:40,271 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small +2026-01-18 15:11:40,392 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps +2026-01-18 15:11:40,392 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small +2026-01-18 15:11:54,702 - knack-transform - INFO - Processing complete +2026-01-18 15:11:54,703 - knack-transform - INFO - Storing 100 results +2026-01-18 15:11:55,335 - knack-transform - INFO - Results stored successfully +2026-01-18 15:11:55,335 - knack-transform - INFO - TextEmbeddingNode transformation complete +2026-01-18 15:11:55,335 - knack-transform - INFO - index id title ... umap_x umap_y row +0 0 41 Über uns ... 0.0 0.0 0.0 +1 1 52 Kontakt ... 0.0 0.0 0.0 +2 2 99 Safety First ... 0.0 0.0 0.0 +3 3 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0 +4 4 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0 +.. ... ... ... ... ... ... ... +95 10 643 Bericht vom 6. Prozesstag im Antifa-Ost Verfah... ... 0.0 0.0 0.0 +96 11 650 #le2310 // Aufruf Ost // Kein Freund – Kein He... ... 0.0 0.0 0.0 +97 12 652 Aufruf: Am 23. Oktober von Hamburg nach Leipzi... ... 0.0 0.0 0.0 +98 13 654 Nach der Demo ging’s bergab ... 0.0 0.0 0.0 +99 14 659 Polizistin unterhält romantische Brieffreundsc... ... 0.0 0.0 0.0 + +[100 rows x 17 columns] +2026-01-18 15:11:55,348 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None +2026-01-18 15:11:55,348 - knack-transform - INFO - Starting ExampleNode transformation +2026-01-18 15:11:55,349 - knack-transform - INFO - Processing 100 rows +2026-01-18 15:11:55,349 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays +2026-01-18 15:11:55,349 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows +2026-01-18 15:11:55,349 - knack-transform - INFO - Embeddings matrix shape: (100, 192) +2026-01-18 15:15:27,968 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps +2026-01-18 15:15:27,968 - knack-transform - INFO - index id title author ... embedding umap_x umap_y row +0 15 672 Lina E. als Widerständlerin? CDU fordert Eingr... LVZ ... 0 0.0 0.0 0.0 +1 16 674 Unschuldig verfolgt (4): Lina E., Henry A. und... Michael Freitag ... 0 0.0 0.0 0.0 +2 17 680 Kein Verdacht Konrad Litschko & Andreas Speit ... 0 0.0 0.0 0.0 +3 18 701 Jede Räumung hat ihren Preis – Aufruf von Leip... LeipzigBesetzen ... 0 0.0 0.0 0.0 +4 19 703 From Berlin to Leipzig – TOGETHER IN OUR CITIE... interkiezionale ... 0 0.0 0.0 0.0 +.. ... ... ... ... ... ... ... ... ... +95 32 1131 Nehmt ihr uns die Häuser ab, haun wir euch Gre... G19 und BikeKitchen Freiburg ... 0 0.0 0.0 0.0 +96 33 1136 Interview – Linksextreme aus Leipzig rechtfert... MDR ... 0 0.0 0.0 0.0 +97 34 1147 Polizei-Großaufgebot soll Sachsens Landtag sch... sächsische Zeitung - Annette Binninger ... 0 0.0 0.0 0.0 +98 35 1149 Fackel-Protest: Sachsens Innenminister unter D... sächsische Zeitung - Annette Binninger ... 0 0.0 0.0 0.0 +99 36 1154 23 Thesen über die Revolte – Wie können wir au... anonyme*r Mensch aus Leipzig ... 
0 0.0 0.0 0.0 + +[100 rows x 17 columns] +2026-01-18 15:15:27,981 - knack-transform - INFO - Starting TextEmbeddingNode transformation +2026-01-18 15:15:27,981 - knack-transform - INFO - Processing 100 rows +2026-01-18 15:15:27,981 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small +2026-01-18 15:15:28,070 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps +2026-01-18 15:15:28,070 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small +2026-01-18 15:15:34,292 - knack-transform - INFO - Processing complete +2026-01-18 15:15:34,293 - knack-transform - INFO - Storing 100 results +2026-01-18 15:15:34,885 - knack-transform - INFO - Results stored successfully +2026-01-18 15:15:34,885 - knack-transform - INFO - TextEmbeddingNode transformation complete +2026-01-18 15:15:34,885 - knack-transform - INFO - index id title ... umap_x umap_y row +0 15 672 Lina E. als Widerständlerin? CDU fordert Eingr... ... 0.0 0.0 0.0 +1 16 674 Unschuldig verfolgt (4): Lina E., Henry A. und... ... 0.0 0.0 0.0 +2 17 680 Kein Verdacht ... 0.0 0.0 0.0 +3 18 701 Jede Räumung hat ihren Preis – Aufruf von Leip... ... 0.0 0.0 0.0 +4 19 703 From Berlin to Leipzig – TOGETHER IN OUR CITIE... ... 0.0 0.0 0.0 +.. ... ... ... ... ... ... ... +95 32 1131 Nehmt ihr uns die Häuser ab, haun wir euch Gre... ... 0.0 0.0 0.0 +96 33 1136 Interview – Linksextreme aus Leipzig rechtfert... ... 0.0 0.0 0.0 +97 34 1147 Polizei-Großaufgebot soll Sachsens Landtag sch... ... 0.0 0.0 0.0 +98 35 1149 Fackel-Protest: Sachsens Innenminister unter D... ... 0.0 0.0 0.0 +99 36 1154 23 Thesen über die Revolte – Wie können wir au... ... 0.0 0.0 0.0 + +[100 rows x 17 columns] +2026-01-18 15:15:34,905 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None +2026-01-18 15:15:34,905 - knack-transform - INFO - Starting ExampleNode transformation +2026-01-18 15:15:34,905 - knack-transform - INFO - Processing 100 rows +2026-01-18 15:15:34,905 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays +2026-01-18 15:15:34,906 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows +2026-01-18 15:15:34,906 - knack-transform - INFO - Embeddings matrix shape: (100, 192) +2026-01-18 15:15:34,906 - knack-transform - INFO - Fitting new UMAP reducer... +2026-01-18 15:15:39,113 - knack-transform - INFO - UMAP transformation complete. Output shape: (100, 3) +2026-01-18 15:15:39,113 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split' +2026-01-18 15:15:39,115 - knack-transform - INFO - Processing complete +2026-01-18 15:15:39,115 - knack-transform - INFO - Storing 100 results +2026-01-18 15:26:34,425 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps +2026-01-18 15:26:34,426 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 0.0 0.0 0.0 +1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 0.0 0.0 0.0 +2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 0.0 0.0 0.0 +3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 0.0 0.0 0.0 +4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 0.0 0.0 0.0 +.. ... ... ... ... ... ... ... +95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 
0.0 0.0 0.0 +96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 0.0 0.0 0.0 +97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 0.0 0.0 0.0 +98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 0.0 0.0 0.0 +99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.0 0.0 0.0 + +[100 rows x 17 columns] +2026-01-18 15:26:34,439 - knack-transform - INFO - Starting TextEmbeddingNode transformation +2026-01-18 15:26:34,439 - knack-transform - INFO - Processing 100 rows +2026-01-18 15:26:34,439 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small +2026-01-18 15:26:34,497 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps +2026-01-18 15:26:34,497 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small +2026-01-18 15:26:40,814 - knack-transform - INFO - Processing complete +2026-01-18 15:26:40,814 - knack-transform - INFO - Storing 100 results +2026-01-18 15:26:41,115 - knack-transform - INFO - Results stored successfully +2026-01-18 15:26:41,115 - knack-transform - INFO - TextEmbeddingNode transformation complete +2026-01-18 15:26:41,115 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 0.0 0.0 0.0 +1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 0.0 0.0 0.0 +2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 0.0 0.0 0.0 +3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 0.0 0.0 0.0 +4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 0.0 0.0 0.0 +.. ... ... ... ... ... ... ... +95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 0.0 0.0 0.0 +96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 0.0 0.0 0.0 +97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 0.0 0.0 0.0 +98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 0.0 0.0 0.0 +99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.0 0.0 0.0 + +[100 rows x 17 columns] +2026-01-18 15:26:41,141 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None +2026-01-18 15:26:41,141 - knack-transform - INFO - Starting ExampleNode transformation +2026-01-18 15:26:41,141 - knack-transform - INFO - Processing 100 rows +2026-01-18 15:26:41,141 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays +2026-01-18 15:26:41,142 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows +2026-01-18 15:26:41,142 - knack-transform - INFO - Embeddings matrix shape: (100, 192) +2026-01-18 15:26:41,142 - knack-transform - INFO - Fitting new UMAP reducer... +2026-01-18 15:26:44,105 - knack-transform - INFO - UMAP transformation complete. Output shape: (100, 3) +2026-01-18 15:26:44,105 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split' +2026-01-18 15:26:44,106 - knack-transform - INFO - Processing complete +2026-01-18 15:26:44,106 - knack-transform - INFO - Storing 100 results +2026-01-18 15:26:44,282 - knack-transform - INFO - Stored 100 UMAP coordinate pairs successfully +2026-01-18 15:26:44,282 - knack-transform - INFO - ExampleNode transformation complete +2026-01-18 15:26:44,282 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 
5.537961 3.468988 3.757369 +1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 4.980662 1.629360 3.269084 +2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 1.055900 2.460792 2.076612 +3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 4.128685 5.247468 4.904186 +4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 5.383136 2.068369 4.368077 +.. ... ... ... ... ... ... ... +95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 5.897925 5.151130 3.241154 +96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 2.919075 5.341392 4.516587 +97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 4.852142 1.179675 4.241960 +98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 5.231822 4.983705 3.941314 +99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.999596 1.613693 2.039646 + +[100 rows x 17 columns] +2026-01-18 15:28:21,676 - knack-transform - INFO - 3D plot displayed +2026-01-18 15:28:43,419 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps +2026-01-18 15:28:43,420 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 1 41 Über uns ... 0.0 0.0 0.0 +1 2 52 Kontakt ... 0.0 0.0 0.0 +2 3 99 Safety First ... 0.0 0.0 0.0 +3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0 +4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0 +... ... ... ... ... ... ... ... +3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 0.0 0.0 0.0 +3674 3675 14619 „Klassenhass“ reloaded? ... 0.0 0.0 0.0 +3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 0.0 0.0 0.0 +3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 0.0 0.0 0.0 +3677 3678 14627 Applaus für die Angeklagten ... 0.0 0.0 0.0 + +[3678 rows x 17 columns] +2026-01-18 15:28:43,432 - knack-transform - INFO - Starting TextEmbeddingNode transformation +2026-01-18 15:28:43,432 - knack-transform - INFO - Processing 3678 rows +2026-01-18 15:28:43,432 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small +2026-01-18 15:28:43,454 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps +2026-01-18 15:28:43,454 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small +2026-01-18 15:30:35,756 - knack-transform - INFO - Processing complete +2026-01-18 15:30:35,757 - knack-transform - INFO - Storing 3678 results +2026-01-18 15:30:42,373 - knack-transform - INFO - Results stored successfully +2026-01-18 15:30:42,374 - knack-transform - INFO - TextEmbeddingNode transformation complete +2026-01-18 15:30:42,374 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 1 41 Über uns ... 0.0 0.0 0.0 +1 2 52 Kontakt ... 0.0 0.0 0.0 +2 3 99 Safety First ... 0.0 0.0 0.0 +3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0 +4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0 +... ... ... ... ... ... ... ... +3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 0.0 0.0 0.0 +3674 3675 14619 „Klassenhass“ reloaded? ... 0.0 0.0 0.0 +3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 0.0 0.0 0.0 +3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 0.0 0.0 0.0 +3677 3678 14627 Applaus für die Angeklagten ... 
0.0 0.0 0.0 + +[3678 rows x 17 columns] +2026-01-18 15:30:42,415 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None +2026-01-18 15:30:42,415 - knack-transform - INFO - Starting ExampleNode transformation +2026-01-18 15:30:42,415 - knack-transform - INFO - Processing 3678 rows +2026-01-18 15:30:42,416 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays +2026-01-18 15:30:42,418 - knack-transform - INFO - Found 3678 valid embeddings out of 3678 rows +2026-01-18 15:30:42,420 - knack-transform - INFO - Embeddings matrix shape: (3678, 192) +2026-01-18 15:30:42,420 - knack-transform - INFO - Fitting new UMAP reducer... +2026-01-18 15:30:53,542 - knack-transform - INFO - UMAP transformation complete. Output shape: (3678, 3) +2026-01-18 15:30:53,542 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split' +2026-01-18 15:30:53,543 - knack-transform - INFO - Processing complete +2026-01-18 15:30:53,543 - knack-transform - INFO - Storing 3678 results +2026-01-18 15:31:00,254 - knack-transform - INFO - Stored 3678 UMAP coordinate pairs successfully +2026-01-18 15:31:00,255 - knack-transform - INFO - ExampleNode transformation complete +2026-01-18 15:31:00,255 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 1 41 Über uns ... 6.138411 7.582617 9.574329 +1 2 52 Kontakt ... 6.801492 5.409409 4.112970 +2 3 99 Safety First ... 9.410303 7.564034 8.076056 +3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393 +4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834 +... ... ... ... ... ... ... ... +3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534 +3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499 +3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753 +3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699 +3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024 + +[3678 rows x 17 columns] +2026-01-18 15:35:27,488 - knack-transform - INFO - 3D plot displayed +2026-01-18 15:35:37,186 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps +2026-01-18 15:35:37,186 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 1 41 Über uns ... 6.138411 7.582617 9.574329 +1 2 52 Kontakt ... 6.801492 5.409409 4.112970 +2 3 99 Safety First ... 9.410303 7.564034 8.076056 +3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393 +4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834 +... ... ... ... ... ... ... ... +3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534 +3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499 +3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753 +3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699 +3677 3678 14627 Applaus für die Angeklagten ... 
9.530050 3.409181 8.588024 + +[3678 rows x 17 columns] +2026-01-18 15:35:37,196 - knack-transform - INFO - Starting TextEmbeddingNode transformation +2026-01-18 15:35:37,196 - knack-transform - INFO - Processing 3678 rows +2026-01-18 15:35:37,196 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small +2026-01-18 15:35:37,251 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps +2026-01-18 15:35:37,251 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small +2026-01-18 15:36:25,468 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 1 41 Über uns ... 6.138411 7.582617 9.574329 +1 2 52 Kontakt ... 6.801492 5.409409 4.112970 +2 3 99 Safety First ... 9.410303 7.564034 8.076056 +3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393 +4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834 +... ... ... ... ... ... ... ... +3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534 +3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499 +3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753 +3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699 +3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024 + +[3678 rows x 17 columns] +2026-01-18 15:37:37,881 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 1 41 Über uns ... 6.138411 7.582617 9.574329 +1 2 52 Kontakt ... 6.801492 5.409409 4.112970 +2 3 99 Safety First ... 9.410303 7.564034 8.076056 +3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393 +4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834 +... ... ... ... ... ... ... ... +3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534 +3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499 +3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753 +3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699 +3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024 + +[3678 rows x 17 columns] +2026-01-18 15:38:08,872 - knack-transform - INFO - 3D plot displayed +2026-01-18 15:39:23,498 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 1 41 Über uns ... 6.138411 7.582617 9.574329 +1 2 52 Kontakt ... 6.801492 5.409409 4.112970 +2 3 99 Safety First ... 9.410303 7.564034 8.076056 +3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393 +4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834 +... ... ... ... ... ... ... ... +3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534 +3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499 +3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753 +3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699 +3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024 + +[3678 rows x 17 columns] +2026-01-18 15:39:52,241 - knack-transform - INFO - index id title ... umap_x umap_y umap_z +0 1 41 Über uns ... 6.138411 7.582617 9.574329 +1 2 52 Kontakt ... 
6.801492 5.409409 4.112970
+2 3 99 Safety First ... 9.410303 7.564034 8.076056
+3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
+4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
+... ... ... ... ... ... ...
+3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
+3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
+3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
+3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
+3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
+
+[3678 rows x 17 columns]
+2026-01-18 15:41:23,688 - knack-transform - INFO - 3D plot displayed
diff --git a/transform/embeddings_node.py b/transform/embeddings_node.py
index 9821eca..aca8174 100644
--- a/transform/embeddings_node.py
+++ b/transform/embeddings_node.py
@@ -13,16 +13,20 @@
 import pandas as pd
 import logging
 import os
 import numpy as np
+import sys
+import pickle
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D

 logger = logging.getLogger("knack-transform")

 try:
     from sentence_transformers import SentenceTransformer
     import torch
-    MINILM_AVAILABLE = True
+    GTE_AVAILABLE = True
 except ImportError:
-    MINILM_AVAILABLE = False
-    logging.warning("MiniLM not available. Install with pip!")
+    GTE_AVAILABLE = False
+    logging.warning("sentence-transformers not available. Install it with pip!")

 try:
     import umap
@@ -36,7 +40,7 @@ class TextEmbeddingNode(TransformNode):
     of posts.
     """
     def __init__(self,
-                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+                 model_name: str = "thenlper/gte-small",
                  model_path: str = None,
                  device: str = "cpu"):
-        """Initialize the ExampleNode.
+        """Initialize the TextEmbeddingNode.

         Args:
@@ -47,27 +51,27 @@
             device: Device to use for computations ('cpu', 'cuda', 'mps')
         """
         self.model_name = model_name
-        self.model_path = model_path or os.environ.get('MINILM_MODEL_PATH')
+        self.model_path = model_path or os.environ.get('GTE_MODEL_PATH')
         self.device = device
         self.model = None
         logger.info(f"Initialized TextEmbeddingNode with model_name={model_name}, model_path={model_path}, device={device}")

     def _setup_model(self):
         """Init the Text Embedding Model."""
-        if not MINILM_AVAILABLE:
-            raise ImportError("MiniLM is required for TextEmbeddingNode. Please install.")
+        if not GTE_AVAILABLE:
+            raise ImportError("sentence-transformers is required for TextEmbeddingNode. Please install it.")
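+        # Resolve the model source below: prefer a local snapshot when model_path is set and exists, otherwise fall back to the hub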
        model_source = None
        if self.model_path:
            if os.path.exists(self.model_path):
                model_source = self.model_path
-                logger.info(f"Loading MiniLM model from local path: {self.model_path}")
+                logger.info(f"Loading GTE model from local path: {self.model_path}")
            else:
-                logger.warning(f"MiniLM_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}")
+                logger.warning(f"GTE_MODEL_PATH '{self.model_path}' not found; falling back to hub model {self.model_name}")
        if model_source is None:
            model_source = self.model_name
-            logger.info(f"Loading MiniLM model from the hub: {self.model_name}")
+            logger.info(f"Loading GTE model from the hub: {self.model_name}")

        if self.device == "cuda" and torch.cuda.is_available():
            self.model = SentenceTransformer(model_source).to('cuda', dtype=torch.float16)
@@ -97,7 +101,7 @@

-        # Example: Add a new column based on existing data
+        # Encode each post's text into an embedding column on the copied frame
         result_df = df.copy()
-        df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))
+        result_df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))

         logger.info("Processing complete")
         return result_df
@@ -111,8 +115,7 @@
         logger.info(f"Storing {len(df)} results")

         # Convert numpy arrays to bytes for BLOB storage
-        # Use tobytes() to serialize numpy arrays efficiently
-        updates = [(row['embedding'].tobytes(), row['id']) for _, row in df.iterrows()]
+        updates = [(row['embedding'], row['id']) for _, row in df.iterrows()]
         con.executemany(
             "UPDATE posts SET embedding = ? WHERE id = ?",
             updates
         )
@@ -167,11 +170,12 @@ class UmapNode(TransformNode):
     """

     def __init__(self,
-                 n_neighbors: int = 15,
+                 n_neighbors: int = 10,
                  min_dist: float = 0.1,
-                 n_components: int = 2,
+                 n_components: int = 3,
                  metric: str = "cosine",
-                 random_state: int = 42):
+                 random_state: int = 42,
+                 model_path: str = None):
         """Initialize the UmapNode.

         Args:
@@ -180,15 +184,18 @@
-            n_components: Number of dimensions to reduce to (default: 2)
+            n_components: Number of dimensions to reduce to (default: 3)
             metric: Distance metric to use (default: 'cosine')
             random_state: Random seed for reproducibility (default: 42)
+            model_path: Path to save/load the fitted UMAP model (default: None; falls back to the UMAP_MODEL_PATH environment variable)
         """
         self.n_neighbors = n_neighbors
         self.min_dist = min_dist
         self.n_components = n_components
         self.metric = metric
         self.random_state = random_state
+        self.model_path = model_path or os.environ.get('UMAP_MODEL_PATH')
         self.reducer = None
         logger.info(f"Initialized UmapNode with n_neighbors={n_neighbors}, min_dist={min_dist}, "
-                    f"n_components={n_components}, metric={metric}, random_state={random_state}")
+                    f"n_components={n_components}, metric={metric}, random_state={random_state}, "
+                    f"model_path={self.model_path}")

     def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
         """Process the input dataframe.
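Review note on the hunk below: the embedding column now round-trips through SQLite as a raw BLOB, so the read side has to reinterpret the stored bytes with the same dtype they were written with. A minimal sketch of that BLOB-to-array step, assuming float32 buffers — the dtype is an assumption, and a float16/float32 mismatch would silently halve or double the recovered dimension count, which is worth checking given that app.log reports shape (100, 192) for the 384-dimensional gte-small model:

    import numpy as np

    def blob_to_array(blob, dtype=np.float32):
        # NULL or empty cells come back falsy; skip them
        if not blob:
            return None
        return np.frombuffer(blob, dtype=dtype)

    # e.g. df['embedding'] = df['embedding'].apply(blob_to_array)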
@@ -231,26 +238,55 @@
         embeddings_matrix = np.vstack(result_df.loc[valid_rows, 'embedding'].values)
         logger.info(f"Embeddings matrix shape: {embeddings_matrix.shape}")

-        # Apply UMAP
-        logger.info("Fitting UMAP reducer...")
-        self.reducer = umap.UMAP(
-            n_neighbors=self.n_neighbors,
-            min_dist=self.min_dist,
-            n_components=self.n_components,
-            metric=self.metric,
-            random_state=self.random_state
-        )
+        # Check if a saved UMAP model exists
+        if self.model_path and os.path.exists(self.model_path):
+            logger.info(f"Loading existing UMAP model from {self.model_path}")
+            try:
+                with open(self.model_path, 'rb') as f:
+                    self.reducer = pickle.load(f)
+                logger.info("UMAP model loaded successfully")
+                umap_coords = self.reducer.transform(embeddings_matrix)
+                logger.info(f"UMAP transformation complete using existing model. Output shape: {umap_coords.shape}")
+            except Exception as e:
+                logger.warning(f"Failed to load UMAP model from {self.model_path}: {e}")
+                logger.info("Falling back to fitting a new model")
+                self.reducer = None

-        umap_coords = self.reducer.fit_transform(embeddings_matrix)
-        logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}")
+        # If no saved model or loading failed, fit a new model
+        if self.reducer is None:
+            logger.info("Fitting new UMAP reducer...")
+            self.reducer = umap.UMAP(
+                n_neighbors=self.n_neighbors,
+                min_dist=self.min_dist,
+                n_components=self.n_components,
+                metric=self.metric,
+                random_state=self.random_state
+            )
+
+            umap_coords = self.reducer.fit_transform(embeddings_matrix)
+            logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}")
+
+            # Save the fitted model, creating the target folder if needed
+            if self.model_path:
+                try:
+                    os.makedirs(os.path.dirname(self.model_path) or '.', exist_ok=True)
+                    with open(self.model_path, 'wb') as f:
+                        pickle.dump(self.reducer, f)
+                    logger.info(f"UMAP model saved to {self.model_path}")
+                except Exception as e:
+                    logger.error(f"Failed to save UMAP model to {self.model_path}: {e}")
+            else:
+                logger.info("No model_path configured; skipping UMAP model save")

         # Add UMAP coordinates to dataframe
         result_df.loc[valid_rows, 'umap_x'] = umap_coords[:, 0]
         result_df.loc[valid_rows, 'umap_y'] = umap_coords[:, 1]
+        result_df.loc[valid_rows, 'umap_z'] = umap_coords[:, 2]

-        # Fill NaN for invalid rows
-        result_df['umap_x'] = result_df['umap_x'].fillna(None)
-        result_df['umap_y'] = result_df['umap_y'].fillna(None)
+        # Rows without embeddings get 0 so the coordinate columns stay numeric
+        result_df['umap_x'] = result_df['umap_x'].fillna(value=0)
+        result_df['umap_y'] = result_df['umap_y'].fillna(value=0)
+        result_df['umap_z'] = result_df['umap_z'].fillna(value=0)

         logger.info("Processing complete")
         return result_df
@@ -270,14 +306,14 @@
         logger.info(f"Storing {len(df)} results")

         # Batch update UMAP coordinates
         updates = [
-            (row['umap_x'], row['umap_y'], row['id'])
+            (row['umap_x'], row['umap_y'], row['umap_z'], row['id'])
             for _, row in df.iterrows()
-            if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y'))
+            if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y')) and pd.notna(row.get('umap_z'))
         ]

         if updates:
             con.executemany(
-                "UPDATE posts SET umap_x = ?, umap_y = ? WHERE id = ?",
+                "UPDATE posts SET umap_x = ?, umap_y = ?, umap_z = ? WHERE id = ?",
                 updates
             )
             con.commit()
@@ -443,3 +479,60 @@ class SimilarityNode(TransformNode):

         # Return new context with results
         return TransformContext(result_df)
+
+def main():
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.FileHandler("app.log"),
+            logging.StreamHandler(sys.stdout)
+        ]
+    )
+    logger = logging.getLogger("knack-transform")
+
+    con = sqlite3.connect("/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite")
+    df = pd.read_sql('select * from posts;', con)
+    #node = TextEmbeddingNode(device='mps')
+    #context = TransformContext(df)
+
+    logger.info(df)
+    #new_context = node.run(con, context)
+    #logger.info(new_context.get_dataframe())
+
+    #umapNode = UmapNode()
+    #new_context = umapNode.run(con, new_context)
+
+    #logger.info(new_context.get_dataframe())
+
+    # Create 3D scatter plot of UMAP coordinates
+    result_df = df
+
+    fig = plt.figure(figsize=(12, 9))
+    ax = fig.add_subplot(111, projection='3d')
+
+    scatter = ax.scatter(
+        result_df['umap_x'],
+        result_df['umap_y'],
+        result_df['umap_z'],
+        c=result_df['id'],
+        cmap='viridis',
+        alpha=0.6,
+        s=50
+    )
+
+    ax.set_xlabel('UMAP X')
+    ax.set_ylabel('UMAP Y')
+    ax.set_zlabel('UMAP Z')
+    ax.set_title('3D UMAP Visualization of Post Embeddings')
+
+    plt.colorbar(scatter, ax=ax, label='Post ID')
+    plt.tight_layout()
+    plt.show()
+
+    logger.info("3D plot displayed")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/transform/ensure_gte_model.sh b/transform/ensure_gte_model.sh
new file mode 100644
index 0000000..41addd4
--- /dev/null
+++ b/transform/ensure_gte_model.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+if [ -d "$GTE_MODEL_PATH" ] && find "$GTE_MODEL_PATH" -type f | grep -q .; then
+  echo "GTE model already present at $GTE_MODEL_PATH"
+  exit 0
+fi
+
+echo "Downloading GTE model to $GTE_MODEL_PATH"
+mkdir -p "$GTE_MODEL_PATH"
+curl -sL "https://huggingface.co/api/models/${GTE_MODEL_ID}" | jq -r '.siblings[].rfilename' | while read -r file; do
+  target="${GTE_MODEL_PATH}/${file}"
+  mkdir -p "$(dirname "$target")"
+  echo "Downloading ${file}"
+  curl -sL "https://huggingface.co/${GTE_MODEL_ID}/resolve/main/${file}" -o "$target"
+done
diff --git a/transform/ensure_minilm_model.sh b/transform/ensure_minilm_model.sh
deleted file mode 100644
index 2d58f24..0000000
--- a/transform/ensure_minilm_model.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-if [ -d "$MINILM_MODEL_PATH" ] && find "$MINILM_MODEL_PATH" -type f | grep -q .; then
-  echo "MiniLM model already present at $MINILM_MODEL_PATH"
-  exit 0
-fi
-
-echo "Downloading MiniLM model to $MINILM_MODEL_PATH"
-mkdir -p "$MINILM_MODEL_PATH"
-curl -sL "https://huggingface.co/api/models/${MINILM_MODEL_ID}" | jq -r '.siblings[].rfilename' | while read -r file; do
-  target="${MINILM_MODEL_PATH}/${file}"
-  mkdir -p "$(dirname "$target")"
-  echo "Downloading ${file}"
-  curl -sL "https://huggingface.co/${MINILM_MODEL_ID}/resolve/main/${file}" -o "$target"
-done
diff --git a/transform/entrypoint.sh b/transform/entrypoint.sh
index 96f5932..f5f79c4 100644
--- a/transform/entrypoint.sh
+++ b/transform/entrypoint.sh
@@ -2,7 +2,7 @@
 set -euo pipefail

 # Run model download with output to stdout/stderr
-/usr/local/bin/ensure_minilm_model.sh 2>&1
+/usr/local/bin/ensure_gte_model.sh 2>&1
 /usr/local/bin/ensure_gliner_model.sh 2>&1

 # Start cron in foreground with logging
diff --git a/transform/pipeline.py b/transform/pipeline.py
index e1f4e9c..5a499c7 100644
--- a/transform/pipeline.py +++ b/transform/pipeline.py @@ -241,7 +241,7 @@ def create_default_pipeline(device: str = "cpu", node_class=TextEmbeddingNode, node_kwargs={ 'device': device, - 'model_path': os.environ.get('MINILM_MODEL_PATH') + 'model_path': os.environ.get('GTE_MODEL_PATH') }, dependencies=[], name='TextEmbeddingNode' diff --git a/visualisation/environment.yml b/visualisation/environment.yml new file mode 100644 index 0000000..2af896c --- /dev/null +++ b/visualisation/environment.yml @@ -0,0 +1,13 @@ +name: knack-viz +channels: + - conda-forge + - defaults +dependencies: + - python=3.11 + - pandas>=2.0.0 + - altair>=5.0.0 + - notebook + - ipykernel + - pip + - pip: + - vega_datasets diff --git a/visualisation/knack_visualization.ipynb b/visualisation/knack_visualization.ipynb new file mode 100644 index 0000000..4af5a03 --- /dev/null +++ b/visualisation/knack_visualization.ipynb @@ -0,0 +1,1381 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8495708c", + "metadata": {}, + "source": [ + "# Knack Database Visualization\n", + "\n", + "This notebook explores and visualizes the findings from the `knack.sqlite` database using Altair for interactive data visualizations." + ] + }, + { + "cell_type": "markdown", + "id": "75cdd349", + "metadata": {}, + "source": [ + "## 1. Import Required Libraries\n", + "\n", + "Import necessary libraries for data manipulation and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c99dde85", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Libraries imported successfully!\n" + ] + } + ], + "source": [ + "import sqlite3\n", + "import pandas as pd\n", + "import altair as alt\n", + "from pathlib import Path\n", + "\n", + "# Configure Altair\n", + "alt.data_transformers.disable_max_rows()\n", + "alt.renderers.enable('default')\n", + "\n", + "print(\"Libraries imported successfully!\")" + ] + }, + { + "cell_type": "markdown", + "id": "198121f5", + "metadata": {}, + "source": [ + "## 2. Connect to SQLite Database\n", + "\n", + "Establish connection to the knack.sqlite database and explore its structure." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "98ddc787", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tables in the database:\n", + " - posts\n", + " - posttags\n", + " - postcategories\n", + " - tags\n", + " - categories\n", + " - authors\n", + " - post_authors\n" + ] + } + ], + "source": [ + "# Connect to the database\n", + "db_path = Path('../data/knack.transformed.sqlite')\n", + "conn = sqlite3.connect(db_path)\n", + "cursor = conn.cursor()\n", + "\n", + "# Get all table names\n", + "cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n", + "tables = cursor.fetchall()\n", + "\n", + "print(\"Tables in the database:\")\n", + "for table in tables:\n", + " print(f\" - {table[0]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4f216388", + "metadata": {}, + "source": [ + "## 3. Explore Database Schema\n", + "\n", + "Examine the structure of each table to understand the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e51dd105", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "Table: posts\n", + "============================================================\n", + "\n", + "Columns:\n", + " index INTEGER \n", + " id INTEGER \n", + " title TEXT \n", + " author TEXT \n", + " date TIMESTAMP \n", + " category TEXT \n", + " url TEXT \n", + " img_link TEXT \n", + " tags TEXT \n", + " text TEXT \n", + " html TEXT \n", + " scraped_at TIMESTAMP \n", + " is_cleaned BOOLEAN \n", + " embedding BLOB \n", + " umap_x REAL \n", + " umap_y REAL \n", + "\n", + "Total rows: 3678\n", + "\n", + "============================================================\n", + "Table: posttags\n", + "============================================================\n", + "\n", + "Columns:\n", + " post_id INTEGER \n", + " tag_id INTEGER \n", + "\n", + "Total rows: 14272\n", + "\n", + "============================================================\n", + "Table: postcategories\n", + "============================================================\n", + "\n", + "Columns:\n", + " post_id INTEGER \n", + " category_id INTEGER \n", + "\n", + "Total rows: 3691\n", + "\n", + "============================================================\n", + "Table: tags\n", + "============================================================\n", + "\n", + "Columns:\n", + " id INTEGER \n", + " tag TEXT \n", + "\n", + "Total rows: 64\n", + "\n", + "============================================================\n", + "Table: categories\n", + "============================================================\n", + "\n", + "Columns:\n", + " id INTEGER \n", + " category TEXT \n", + "\n", + "Total rows: 6\n", + "\n", + "============================================================\n", + "Table: authors\n", + "============================================================\n", + "\n", + "Columns:\n", + " id INTEGER \n", + " name TEXT \n", + " type TEXT \n", + " created_at TIMESTAMP \n", + "\n", + "Total rows: 1143\n", + "\n", + "============================================================\n", + "Table: post_authors\n", + "============================================================\n", + "\n", + "Columns:\n", + " post_id INTEGER \n", + " author_id INTEGER \n", + "\n", + "Total rows: 4934\n" + ] + } + ], + "source": [ + "# Examine schema for each table\n", + "for table in tables:\n", + " table_name = table[0]\n", + " print(f\"\\n{'='*60}\")\n", + " print(f\"Table: {table_name}\")\n", + " print('='*60)\n", + " \n", + " # Get column information\n", + " cursor.execute(f\"PRAGMA table_info({table_name})\")\n", + " columns = cursor.fetchall()\n", + " \n", + " print(\"\\nColumns:\")\n", + " for col in columns:\n", + " print(f\" {col[1]:20} {col[2]:15} {'NOT NULL' if col[3] else ''}\")\n", + " \n", + " # Get row count\n", + " cursor.execute(f\"SELECT COUNT(*) FROM {table_name}\")\n", + " count = cursor.fetchone()[0]\n", + " print(f\"\\nTotal rows: {count}\")" + ] + }, + { + "cell_type": "markdown", + "id": "25ffce32", + "metadata": {}, + "source": [ + "## 4. Load Data from Database\n", + "\n", + "Load the data from tables into pandas DataFrames for analysis and visualization." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1459d68a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded posts: 3678 rows, 16 columns\n",
+      "Loaded posttags: 14272 rows, 2 columns\n",
+      "Loaded postcategories: 3691 rows, 2 columns\n",
+      "Loaded tags: 64 rows, 2 columns\n",
+      "Loaded categories: 6 rows, 2 columns\n",
+      "Loaded authors: 1143 rows, 4 columns\n",
+      "Loaded post_authors: 4934 rows, 2 columns\n",
+      "\n",
+      "Available dataframes: ['posts', 'posttags', 'postcategories', 'tags', 'categories', 'authors', 'post_authors']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load all tables into DataFrames\n",
+    "dataframes = {}\n",
+    "\n",
+    "for table in tables:\n",
+    "    table_name = table[0]\n",
+    "    query = f\"SELECT * FROM {table_name}\"\n",
+    "    df = pd.read_sql_query(query, conn)\n",
+    "    dataframes[table_name] = df\n",
+    "    print(f\"Loaded {table_name}: {df.shape[0]} rows, {df.shape[1]} columns\")\n",
+    "\n",
+    "# Display available dataframes\n",
+    "print(f\"\\nAvailable dataframes: {list(dataframes.keys())}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c34b1bc5",
+   "metadata": {},
+   "source": [
+    "## 5. Explore Data Structure\n",
+    "\n",
+    "Examine the first dataframe to understand the data better."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "91616185",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Exploring: posts\n",
+      "\n",
+      "Shape: (3678, 16)\n",
+      "\n",
+      "Data types:\n",
+      "index           int64\n",
+      "id              int64\n",
+      "title          object\n",
+      "author         object\n",
+      "date           object\n",
+      "category       object\n",
+      "url            object\n",
+      "img_link       object\n",
+      "tags           object\n",
+      "text           object\n",
+      "html           object\n",
+      "scraped_at     object\n",
+      "is_cleaned      int64\n",
+      "embedding      object\n",
+      "umap_x        float64\n",
+      "umap_y        float64\n",
+      "dtype: object\n",
+      "\n",
+      "Missing values:\n",
+      "index           0\n",
+      "id              0\n",
+      "title           0\n",
+      "author          3\n",
+      "date            3\n",
+      "category        3\n",
+      "url             0\n",
+      "img_link      148\n",
+      "tags            4\n",
+      "text            0\n",
+      "html            0\n",
+      "scraped_at      0\n",
+      "is_cleaned      0\n",
+      "embedding       0\n",
+      "umap_x          0\n",
+      "umap_y          0\n",
+      "dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Select the first table to explore (or specify a specific table)\n",
+    "if dataframes:\n",
+    "    first_table = list(dataframes.keys())[0]\n",
+    "    df = dataframes[first_table]\n",
+    "    \n",
+    "    print(f\"Exploring: {first_table}\")\n",
+    "    print(f\"\\nShape: {df.shape}\")\n",
+    "    print(f\"\\nData types:\\n{df.dtypes}\")\n",
+    "    \n",
+    "    print(\"\\nMissing values:\")\n",
+    "    print(df.isnull().sum())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f9b0e8d7",
+   "metadata": {},
+   "source": [
+    "## 6. Create Time Series Visualizations\n",
+    "\n",
+    "If the data contains temporal information, create time series visualizations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "2190a06b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found potential date columns: ['date']\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Check for date/time columns and create time series visualizations\n", + "if dataframes:\n", + " df = dataframes[list(dataframes.keys())[0]]\n", + " \n", + " # Look for columns that might contain dates (check column names)\n", + " date_like_cols = [col for col in df.columns if any(\n", + " keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated', 'timestamp']\n", + " )]\n", + " \n", + " if date_like_cols:\n", + " print(f\"Found potential date columns: {date_like_cols}\")\n", + " \n", + " # Try to convert the first date-like column to datetime\n", + " date_col = date_like_cols[0]\n", + " try:\n", + " df[date_col] = pd.to_datetime(df[date_col], errors='coerce')\n", + " \n", + " # Create a time series chart - count records over time\n", + " time_series = df.groupby(pd.Grouper(key=date_col, freq='M')).size().reset_index(name='count')\n", + " \n", + " chart = alt.Chart(time_series).mark_line(point=True).encode(\n", + " x=alt.X(f'{date_col}:T', title='Date'),\n", + " y=alt.Y('count:Q', title='Count'),\n", + " tooltip=[date_col, 'count']\n", + " ).properties(\n", + " title=f'Records Over Time',\n", + " width=700,\n", + " height=400\n", + " ).interactive()\n", + " \n", + " display(chart)\n", + " except Exception as e:\n", + " print(f\"Could not create time series chart: {e}\")\n", + " else:\n", + " print(\"No date/time columns found\")" + ] + }, + { + "cell_type": "markdown", + "id": "793026df", + "metadata": {}, + "source": [ + "### Articles per Category\n", + "\n", + "Visualize the distribution of articles across different categories." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "22c47b71", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['posts', 'posttags', 'postcategories', 'tags', 'categories', 'authors', 'post_authors'])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataframes.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1ac9fae5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Total categories: 6\n", + "Most articles in a category: 2098\n", + "Average articles per category: 615.17\n" + ] + } + ], + "source": [ + "# Check if categorisation data exists and create histogram\n", + "if 'postcategories' in dataframes and 'categories' in dataframes:\n", + " df_post_cat = dataframes['postcategories']\n", + " df_categories = dataframes['categories']\n", + " \n", + " # Join postcategories with categories to get category names\n", + " if 'category_id' in df_post_cat.columns and 'id' in df_categories.columns and 'category' in df_categories.columns:\n", + " # Merge the two tables\n", + " df_merged = df_post_cat.merge(\n", + " df_categories[['id', 'category']], \n", + " left_on='category_id', \n", + " right_on='id',\n", + " how='left'\n", + " )\n", + " \n", + " # Count articles per category\n", + " category_counts = df_merged['category'].value_counts().reset_index()\n", + " category_counts.columns = ['category', 'article_count']\n", + " \n", + " # Sort by count descending\n", + " category_counts = category_counts.sort_values('article_count', ascending=False)\n", + " \n", + " 
chart = alt.Chart(category_counts).mark_bar().encode(\n", + " x=alt.X('category:N', sort='-y', title='Category', axis=alt.Axis(labelAngle=-45)),\n", + " y=alt.Y('article_count:Q', title='Number of Articles'),\n", + " color=alt.Color('article_count:Q', scale=alt.Scale(scheme='viridis'), legend=None),\n", + " tooltip=['category', alt.Tooltip('article_count:Q', title='Articles')]\n", + " ).properties(\n", + " title='Distribution of Articles per Category',\n", + " width=700,\n", + " height=450\n", + " ).interactive()\n", + " \n", + " display(chart)\n", + " \n", + " # Show summary statistics\n", + " print(f\"\\nTotal categories: {len(category_counts)}\")\n", + " print(f\"Most articles in a category: {category_counts['article_count'].max()}\")\n", + " print(f\"Average articles per category: {category_counts['article_count'].mean():.2f}\")\n", + " else:\n", + " print(\"Could not find required columns for joining tables\")\n", + "else:\n", + " print(\"Need both 'postcategories' and 'categories' tables in database\")" + ] + }, + { + "cell_type": "markdown", + "id": "56c89ec3", + "metadata": {}, + "source": [ + "### Articles per Tag\n", + "\n", + "Visualize the distribution of articles across different tags." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "95a28c5f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Total tags: 64\n", + "Most articles with a tag: 1954\n", + "Average articles per tag: 223.00\n", + "Median articles per tag: 101.50\n" + ] + } + ], + "source": [ + "# Check if tag data exists and create histogram\n", + "if 'posttags' in dataframes and 'tags' in dataframes:\n", + " df_post_tags = dataframes['posttags']\n", + " df_tags = dataframes['tags']\n", + " \n", + " # Join posttags with tags to get tag names\n", + " if 'tag_id' in df_post_tags.columns and 'id' in df_tags.columns and 'tag' in df_tags.columns:\n", + " # Merge the two tables\n", + " df_merged = df_post_tags.merge(\n", + " df_tags[['id', 'tag']], \n", + " left_on='tag_id', \n", + " right_on='id',\n", + " how='left'\n", + " )\n", + " \n", + " # Count articles per tag\n", + " tag_counts = df_merged['tag'].value_counts().reset_index()\n", + " tag_counts.columns = ['tag', 'article_count']\n", + " \n", + " # Show top 30 tags for readability\n", + " tag_counts_top = tag_counts.head(30).sort_values('article_count', ascending=False)\n", + " \n", + " chart = alt.Chart(tag_counts_top).mark_bar().encode(\n", + " x=alt.X('tag:N', sort='-y', title='Tag', axis=alt.Axis(labelAngle=-45)),\n", + " y=alt.Y('article_count:Q', title='Number of Articles'),\n", + " color=alt.Color('article_count:Q', scale=alt.Scale(scheme='oranges'), legend=None),\n", + " tooltip=['tag', alt.Tooltip('article_count:Q', title='Articles')]\n", + " ).properties(\n", + " title='Distribution of Articles per Tag (Top 30)',\n", + " width=700,\n", + " height=450\n", + " ).interactive()\n", + " \n", + " display(chart)\n", + " \n", + " # Show summary statistics\n", + " print(f\"\\nTotal tags: {len(tag_counts)}\")\n", + " print(f\"Most articles with a tag: {tag_counts['article_count'].max()}\")\n", + " print(f\"Average articles per tag: {tag_counts['article_count'].mean():.2f}\")\n", + " print(f\"Median articles per tag: {tag_counts['article_count'].median():.2f}\")\n", + " else:\n", + " print(\"Could not find required columns 
for joining tables\")\n",
+    "else:\n",
+    "    print(\"Need both 'posttags' and 'tags' tables in database\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "549e6f38",
+   "metadata": {},
+   "source": [
+    "### Articles per Author\n",
+    "\n",
+    "Visualize the distribution of articles across different authors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "a49be6f5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "\n",
+       "\n",
+       ""
+      ],
+      "text/plain": [
+       "alt.Chart(...)"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Total authors: 1126\n",
+      "Most articles by a single author: 700\n",
+      "Average articles per author: 4.38\n",
+      "Median articles per author: 1.00\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check if author data exists and create a bar chart\n",
+    "if 'post_authors' in dataframes and 'authors' in dataframes:\n",
+    "    df_post_authors = dataframes['post_authors']\n",
+    "    df_authors = dataframes['authors']\n",
+    "    \n",
+    "    # Join post_authors with authors to get author names\n",
+    "    if 'author_id' in df_post_authors.columns and 'id' in df_authors.columns and 'name' in df_authors.columns:\n",
+    "        # Merge the two tables\n",
+    "        df_merged = df_post_authors.merge(\n",
+    "            df_authors[['id', 'name']], \n",
+    "            left_on='author_id', \n",
+    "            right_on='id',\n",
+    "            how='left'\n",
+    "        )\n",
+    "        \n",
+    "        # Count articles per author\n",
+    "        author_counts = df_merged['name'].value_counts().reset_index()\n",
+    "        author_counts.columns = ['author', 'article_count']\n",
+    "        \n",
+    "        # Show top 30 authors for readability\n",
+    "        author_counts_top = author_counts.head(30).sort_values('article_count', ascending=False)\n",
+    "        \n",
+    "        chart = alt.Chart(author_counts_top).mark_bar().encode(\n",
+    "            x=alt.X('author:N', sort='-y', title='Author', axis=alt.Axis(labelAngle=-45)),\n",
+    "            y=alt.Y('article_count:Q', title='Number of Articles'),\n",
+    "            color=alt.Color('article_count:Q', scale=alt.Scale(scheme='oranges'), legend=None),\n",
+    "            tooltip=['author', alt.Tooltip('article_count:Q', title='Articles')]\n",
+    "        ).properties(\n",
+    "            title='Distribution of Articles per Author (Top 30)',\n",
+    "            width=700,\n",
+    "            height=450\n",
+    "        ).interactive()\n",
+    "        \n",
+    "        display(chart)\n",
+    "        \n",
+    "        # Show summary statistics\n",
+    "        print(f\"\\nTotal authors: {len(author_counts)}\")\n",
+    "        print(f\"Most articles by a single author: {author_counts['article_count'].max()}\")\n",
+    "        print(f\"Average articles per author: {author_counts['article_count'].mean():.2f}\")\n",
+    "        print(f\"Median articles per author: {author_counts['article_count'].median():.2f}\")\n",
+    "    else:\n",
+    "        print(\"Could not find required columns for joining tables\")\n",
+    "else:\n",
+    "    print(\"Need both 'post_authors' and 'authors' tables in database\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7f6f1539",
+   "metadata": {},
+   "source": [
+    "### UMAP Visualization\n",
+    "\n",
+    "Visualize the UMAP dimensionality reduction in 2D space."
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "196be503", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found UMAP coordinates in table: posts\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Total points: 5021\n", + "Unique authors: 1127\n", + "Top 15 authors shown in legend (others grouped as 'Other')\n" + ] + } + ], + "source": [ + "# Check for UMAP coordinates and create scatter plot with author coloring\n", + "umap_found = False\n", + "\n", + "# Look for tables with umap_x and umap_y columns\n", + "for table_name, df in dataframes.items():\n", + " if 'umap_x' in df.columns and 'umap_y' in df.columns:\n", + " print(f\"Found UMAP coordinates in table: {table_name}\")\n", + " umap_found = True\n", + " \n", + " # Check if we can join with authors\n", + " if 'posts' in dataframes and 'post_authors' in dataframes and 'authors' in dataframes:\n", + " df_posts = dataframes['posts']\n", + " df_post_authors = dataframes['post_authors']\n", + " df_authors = dataframes['authors']\n", + " \n", + " # Check if the current table has necessary columns for joining\n", + " if 'id' in df.columns or 'post_id' in df.columns:\n", + " post_id_col = 'id' if 'id' in df.columns else 'post_id'\n", + " \n", + " # Start with posts table that has UMAP coordinates\n", + " df_umap = df[[post_id_col, 'umap_x', 'umap_y']].dropna(subset=['umap_x', 'umap_y'])\n", + " \n", + " # Join with post_authors to get author_id\n", + " if 'post_id' in df_post_authors.columns and 'author_id' in df_post_authors.columns:\n", + " df_umap = df_umap.merge(\n", + " df_post_authors[['post_id', 'author_id']],\n", + " left_on=post_id_col,\n", + " right_on='post_id',\n", + " how='left'\n", + " )\n", + " \n", + " # Join with authors to get author name\n", + " if 'id' in df_authors.columns and 'name' in df_authors.columns:\n", + " df_umap = df_umap.merge(\n", + " df_authors[['id', 'name']],\n", + " left_on='author_id',\n", + " right_on='id',\n", + " how='left'\n", + " )\n", + " \n", + " # Rename name column to author for clarity\n", + " df_umap = df_umap.rename(columns={'name': 'author'})\n", + " \n", + " # Fill missing authors with 'Unknown'\n", + " df_umap['author'] = df_umap['author'].fillna('Unknown')\n", + " \n", + " # Get top 15 authors by count for better visualization\n", + " top_authors = df_umap['author'].value_counts().head(15).index.tolist()\n", + " df_umap['author_group'] = df_umap['author'].apply(\n", + " lambda x: x if x in top_authors else 'Other'\n", + " )\n", + " \n", + " # Create scatter plot with author coloring\n", + " scatter = alt.Chart(df_umap).mark_circle(size=40, opacity=0.7).encode(\n", + " x=alt.X('umap_x:Q', title='UMAP Dimension 1'),\n", + " y=alt.Y('umap_y:Q', title='UMAP Dimension 2'),\n", + " color=alt.Color('author_group:N', title='Author', scale=alt.Scale(scheme='tableau20')),\n", + " tooltip=['author', 'umap_x', 'umap_y']\n", + " ).properties(\n", + " title='UMAP 2D Projection by Author',\n", + " width=800,\n", + " height=600\n", + " ).interactive()\n", + " \n", + " display(scatter)\n", + " \n", + " print(f\"\\nTotal points: {len(df_umap)}\")\n", + " print(f\"Unique authors: {df_umap['author'].nunique()}\")\n", + " print(f\"Top 15 authors shown in legend (others grouped as 'Other')\")\n", + " else:\n", + " print(\"Could not find 
+  {
+   "cell_type": "markdown",
+   "id": "c57a57fa",
+   "metadata": {},
+   "source": [
+    "### 3D Embedding Visualization\n",
+    "\n",
+    "Visualize the high-dimensional embeddings in 3D space by plotting their first three dimensions.\n"
+   ]
+  },
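+  {
+   "cell_type": "markdown",
+   "id": "f1a5b6c7",
+   "metadata": {},
+   "source": [
+    "The cell below simply slices off the first three embedding dimensions, which is cheap but arbitrary. A hedged alternative is PCA over the full embedding matrix, which picks the three directions of maximum variance instead. The sketch assumes scikit-learn is available and that the embeddings decode to equal-length float vectors; the 384 below is only a placeholder dimensionality."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2b6c7d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: PCA-based 3D projection (assumes scikit-learn is installed)\n",
+    "from sklearn.decomposition import PCA\n",
+    "import numpy as np\n",
+    "\n",
+    "# Demo on random data; swap X for the stacked embedding matrix of shape\n",
+    "# (n_posts, embedding_dim) once the embeddings decode cleanly\n",
+    "X = np.random.default_rng(42).normal(size=(100, 384))\n",
+    "coords_3d = PCA(n_components=3).fit_transform(X)\n",
+    "print(coords_3d.shape)  # (100, 3)"
+   ]
+  },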
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "42352fef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found embedding column in posts table\n",
+      "No valid embeddings found\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import plotly.graph_objects as go\n",
+    "import json\n",
+    "\n",
+    "# Check if the posts table has an embedding column\n",
+    "if 'posts' in dataframes:\n",
+    "    df_posts = dataframes['posts']\n",
+    "\n",
+    "    if 'embedding' in df_posts.columns:\n",
+    "        print(\"Found embedding column in posts table\")\n",
+    "\n",
+    "        # Extract embeddings and convert to arrays\n",
+    "        embeddings_3d = []\n",
+    "        valid_indices = []\n",
+    "\n",
+    "        for idx, embedding in enumerate(df_posts['embedding']):\n",
+    "            try:\n",
+    "                # Handle different embedding formats (string, list, array, bytes)\n",
+    "                if isinstance(embedding, bytes):\n",
+    "                    try:\n",
+    "                        emb_array = np.array(json.loads(embedding.decode('utf-8')))\n",
+    "                    except (UnicodeDecodeError, json.JSONDecodeError):\n",
+    "                        # Raw BLOB fallback; assumes float32, the dtype\n",
+    "                        # sentence-transformers typically emits. Adjust the\n",
+    "                        # dtype if the pipeline stores float64.\n",
+    "                        emb_array = np.frombuffer(embedding, dtype=np.float32)\n",
+    "                elif isinstance(embedding, str):\n",
+    "                    emb_array = np.array(json.loads(embedding))\n",
+    "                elif isinstance(embedding, (list, tuple)):\n",
+    "                    emb_array = np.array(embedding)\n",
+    "                else:\n",
+    "                    emb_array = embedding\n",
+    "\n",
+    "                if emb_array is not None and len(emb_array) >= 3:\n",
+    "                    # Take only the first 3 dimensions\n",
+    "                    embeddings_3d.append(emb_array[:3])\n",
+    "                    valid_indices.append(idx)\n",
+    "            except Exception:\n",
+    "                continue\n",
+    "\n",
+    "        if embeddings_3d:\n",
+    "            # Convert to a numpy array and ensure it is 2D (n_embeddings, 3)\n",
+    "            embeddings_3d = np.array(embeddings_3d)\n",
+    "            if embeddings_3d.ndim == 1:\n",
+    "                embeddings_3d = embeddings_3d.reshape(-1, 3)\n",
+    "            print(f\"Extracted {len(embeddings_3d)} embeddings with shape {embeddings_3d.shape}\")\n",
+    "\n",
+    "            # Create a dataframe with 3D coordinates\n",
+    "            df_3d = pd.DataFrame({\n",
+    "                'dim_1': embeddings_3d[:, 0],\n",
+    "                'dim_2': embeddings_3d[:, 1],\n",
+    "                'dim_3': embeddings_3d[:, 2]\n",
+    "            })\n",
+    "\n",
+    "            # Try to add author information\n",
+    "            if 'post_authors' in dataframes and 'authors' in dataframes:\n",
+    "                try:\n",
+    "                    df_post_authors = dataframes['post_authors']\n",
+    "                    df_authors = dataframes['authors']\n",
+    "\n",
+    "                    # Get author names for valid indices\n",
+    "                    authors = []\n",
+    "                    for idx in valid_indices:\n",
+    "                        post_id = df_posts.iloc[idx]['id'] if 'id' in df_posts.columns else None\n",
+    "                        if post_id is not None:\n",
+    "                            author_rows = df_post_authors[df_post_authors['post_id'] == post_id]\n",
+    "                            if not author_rows.empty:\n",
+    "                                author_id = author_rows.iloc[0]['author_id']\n",
+    "                                author_name = df_authors[df_authors['id'] == author_id]['name'].values\n",
+    "                                authors.append(author_name[0] if len(author_name) > 0 else 'Unknown')\n",
+    "                            else:\n",
+    "                                authors.append('Unknown')\n",
+    "                        else:\n",
+    "                            authors.append('Unknown')\n",
+    "\n",
+    "                    df_3d['author'] = authors\n",
+    "\n",
+    "                    # Get the top 10 authors for coloring\n",
+    "                    top_authors = df_3d['author'].value_counts().head(10).index.tolist()\n",
+    "                    df_3d['author_group'] = df_3d['author'].apply(\n",
+    "                        lambda x: x if x in top_authors else 'Other'\n",
+    "                    )\n",
+    "\n",
+    "                    # Create a 3D scatter plot with Plotly\n",
+    "                    fig = go.Figure(data=[go.Scatter3d(\n",
+    "                        x=df_3d['dim_1'],\n",
+    "                        y=df_3d['dim_2'],\n",
+    "                        z=df_3d['dim_3'],\n",
+    "                        mode='markers',\n",
+    "                        marker=dict(\n",
+    "                            size=4,\n",
+    "                            color=[top_authors.index(author) if author in top_authors else len(top_authors)\n",
+    "                                   for author in df_3d['author_group']],\n",
+    "                            colorscale='Viridis',\n",
+    "                            showscale=True,\n",
+    "                            colorbar=dict(title=\"Author Group\"),\n",
+    "                            opacity=0.7\n",
+    "                        ),\n",
+    "                        text=df_3d['author'],\n",
+    "                        hovertemplate='%{text}