diff --git a/src/create_dataset.py b/src/create_dataset.py
new file mode 100644
index 0000000..2c90e8b
--- /dev/null
+++ b/src/create_dataset.py
@@ -0,0 +1,11 @@
+import pandas as pd
+
+tweets_path = "../raw_data/tweets.csv"
+# Load the raw tweets CSV; the relative path is resolved against the current working directory.
+tweets = pd.read_csv(tweets_path)
+
+# TODO extract mentions from tweets
+
+# TODO generate co-occurrence matrix
+
+# TODO save co-occurrence matrix
diff --git a/src/create_id_list.py b/src/create_id_list.py
new file mode 100644
index 0000000..5e9bcd7
--- /dev/null
+++ b/src/create_id_list.py
@@ -0,0 +1,8 @@
+import pandas as pd
+
+tweets_path = "../raw_data/tweets.csv"
+# Load the raw tweets CSV; the relative path is resolved against the current working directory.
+tweets = pd.read_csv(tweets_path)
+# Print every value of the `id` column, one per line; the loop name avoids shadowing builtin id().
+for tweet_id in tweets.id:
+    print(tweet_id)