Fix: removing Photo posts & duplicates | v 0.0.28
Saikamesh committed Feb 25, 2024
1 parent 30be9e8 commit 645d8ce
Showing 7 changed files with 113 additions and 36 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.txt
@@ -34,4 +34,8 @@
- Change: Individual csv files are now saved in a subfolder inside Data/root. Merged file will stay in Data/root Folder.

[Version 0.0.27] - 2024-02-12
- Added: Documentation for structuring query object
- Added: Documentation for structuring query object

[Version 0.0.28] - 2024-02-25
- Fix: Fixed a bug that caused photo posts to be returned in the query results.
- Fix: Merged data will no longer contain duplicates.
23 changes: 22 additions & 1 deletion requirements.txt
@@ -1,7 +1,28 @@
certifi==2023.11.17
charset-normalizer==3.3.2
docutils==0.20.1
idna==3.6
importlib-metadata==7.0.1
jaraco.classes==3.3.0
keyring==24.3.0
markdown-it-py==3.0.0
mdurl==0.1.2
more-itertools==10.2.0
nh3==0.2.15
numpy==1.26.4
pandas==2.1.4
pkginfo==1.9.6
Pygments==2.17.2
python-dateutil==2.8.2
pytz==2024.1
pywin32-ctypes==0.2.2
readme-renderer==42.0
requests==2.31.0
requests-toolbelt==1.0.0
rfc3986==2.0.0
rich==13.7.0
six==1.16.0
tzdata==2023.3
twine==4.0.2
tzdata==2024.1
urllib3==2.1.0
zipp==3.17.0
8 changes: 4 additions & 4 deletions setup.py
@@ -11,16 +11,16 @@

setup(
name="tt_crawl",
version="0.0.27",
description="A TikTok crawler",
version="0.0.28",
description="A Python package for interacting with TikTok Research API",
long_description=open("README.md").read() + "\n\n" + open("CHANGELOG.txt").read(),
long_description_content_type="text/markdown",
url="https://github.com/Saikamesh/Mindful-Tiktok",
author="Sai Dwibhashyam",
license="GPL-3.0 License",
classifiers=classifiers,
keywords="TikTok, TikTok Research API, TikTok Data",
keywords="TikTok, TikTok Research API, TikTok API, TikTok Data",
packages=find_packages(),
python_requires=">=3.10",
install_requires=["requests==2.31.0", ],
install_requires=["requests==2.31.0", "pandas==2.1.4"],
)
37 changes: 37 additions & 0 deletions tt_crawl/helper.py
@@ -0,0 +1,37 @@
import requests
import pandas as pd


def validate_urls(response_json: dict) -> dict:
    """
    Removes all photo posts from the data.
    Args:
        response_json (dict): The response object from the TikTok API.
    Returns:
        dict: The response object with photo posts removed.
    """

    EMBED_URL = "https://www.tiktok.com/embed/"

    # Iterate over a copy so that removing an item does not skip the next one.
    for video in list(response_json["data"]["videos"]):
        url = EMBED_URL + str(video["id"])
        res = requests.get(url)

        # Posts with no video embed (photo posts) fail this request and are dropped.
        if not res.ok:
            response_json["data"]["videos"].remove(video)

    return response_json


def remove_duplicate_rows(file: str) -> None:
    """
    Removes duplicate rows from a CSV file.
    Args:
        file (str): The path to the CSV file, including the file name.
    """
    df = pd.read_csv(file)
    duplicate_data = df[df.id.duplicated()]
    if duplicate_data.empty:
        return
    # Keep the first occurrence of each id and overwrite the file in place.
    df.drop_duplicates(subset="id", keep="first", inplace=True)
    df.to_csv(file, index=False)
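
A minimal usage sketch for the new helpers (not part of this commit). It assumes the package is installed as tt_crawl and that the response has the data.videos shape used above; the video ids and CSV path are hypothetical, and validate_urls makes one HTTP request per video.

from tt_crawl import helper as hl

# Hypothetical response shaped like the object validate_urls expects.
response_json = {
    "data": {
        "videos": [
            {"id": 7300000000000000001, "video_description": "regular video"},
            {"id": 7300000000000000002, "video_description": "photo post"},
        ]
    }
}

# Entries whose embed URL does not resolve are dropped.
filtered = hl.validate_urls(response_json)
print(len(filtered["data"]["videos"]))

# Removes rows with a repeated id from a previously written CSV file (hypothetical path).
hl.remove_duplicate_rows("Data/video_data/sample.csv")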
48 changes: 32 additions & 16 deletions tt_crawl/tt_crawler.py
@@ -7,7 +7,8 @@
import re
from typing import Union
from . import utils as ut
from . import validation as vl
from . import helper as hl


class TikTokCrawler:
OAUTH_URL = "https://open.tiktokapis.com/v2/oauth/token/"
@@ -104,7 +105,7 @@ def _process_request(
return err
else:
response_json = response.json()
res_json = vl.validate_urls(response_json)
res_json = hl.validate_urls(response_json)
res_json["search_key"] = search_key
res_json["queried_date"] = queried_date
return res_json
@@ -146,7 +147,7 @@ def query_videos(
response_list.append(response_json)
return response_list
else:
req = ut.generate_request_query(query, start_date, end_date)
req = ut.generate_request_query(query, start_date, end_date)
response_json = self._process_request(req, search_key, queried_date)
return response_json

@@ -157,9 +158,9 @@ def make_csv(
Makes a csv file from given data.
Args:
data (Union[dict, list]): The data to be converted to csv.
file_name (str, optional): The name of the csv file. Defaults to a search key based on query.
data_dir (str, optional): The directory in which the csv file is to be stored. Defaults to current working dir.
data (Union[dict, list]): The data to be converted to csv. This is usually the response from the query_videos method.
file_name (str, optional): The name of the csv file. Defaults to a search key with date-time based on query.
data_dir (str, optional): The directory in which the csv file is to be stored. Defaults to /Data/video_data in current working dir.
"""
fields = self.FIELDS.split(",") + ["search_key", "queried_date"]

@@ -190,22 +191,35 @@ def make_csv(

def merge_all_data(self, data_dir: str = None, file_name: str = None) -> None:
"""
Merges all the csv files in the Data folder.
Merges multiple csv files into one file.
Args:
data_dir (str, optional): The path to the directory from which the csv files are to be read.
Defaults to folder '/Data/video_data' in current working dir.
The merged file is stored in the same directory as the data_dir. Defaults to /Data in current working dir.
file_name (str, optional): The name of the merged csv file. Defaults to 'video_list.csv'.
Note: If the file_name already exists, the data is appended to the existing file.
It is recommended to use a new file name every time you want to create a new merged file.
"""
if not data_dir:
data_dir = os.path.join(os.getcwd(), "Data", "video_data")
# file_name = (f"video_list_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" )

if not file_name:
file_name = "video_list.csv"
# file_name = (
# f"merged_data_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
# )

if not data_dir:
data_dir = os.path.join(os.getcwd(), "Data", "video_data")
file_path = os.path.join(os.getcwd(), "Data", file_name)
else:
file_path = os.path.join(data_dir, file_name)
file_path = file_path.replace("\\", "/")

all_files = glob.glob(os.path.join(data_dir, "*.csv"))
file_path = os.path.join(os.getcwd(), "Data", file_name)
print(all_files)

with open(
os.path.join(file_path), "a", newline="", encoding="utf-8"
) as fout:
with open(os.path.join(file_path), "a", newline="", encoding="utf-8") as fout:
writer = csv.writer(fout)
header_saved = False
for filename in all_files:
@@ -217,3 +231,5 @@ def merge_all_data(self, data_dir: str = None, file_name: str = None) -> None:
header_saved = True
for row in reader:
writer.writerow(row)

hl.remove_duplicate_rows(file_path)
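
A rough end-to-end sketch of the changed flow (not part of this commit). The constructor arguments and the exact query_videos signature are assumptions based on this diff and may differ; the query dict is a hypothetical example of the structure read by utils.generate_request_query.

from tt_crawl.tt_crawler import TikTokCrawler

# Hypothetical constructor arguments -- the credentials the crawler needs
# are not shown in this diff.
crawler = TikTokCrawler("your-client-key", "your-client-secret")

# Hypothetical search query; generate_request_query reads query["query"].
query = {
    "query": {
        "and": [
            {"operation": "IN", "field_name": "region_code", "field_values": ["US"]}
        ]
    }
}

# Dates are YYYYMMDD strings, as expected by utils.check_date_range.
data = crawler.query_videos(query, "20240201", "20240210")

# Photo posts are already filtered out by helper.validate_urls inside
# _process_request before the data reaches make_csv.
crawler.make_csv(data)

# Per-query csv files are merged, and helper.remove_duplicate_rows is
# applied to the merged file at the end of merge_all_data.
crawler.merge_all_data(file_name="video_list.csv")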
14 changes: 13 additions & 1 deletion tt_crawl/utils.py
@@ -5,6 +5,10 @@
def check_date_range(start_date: str, end_date: str) -> bool:
"""
Returns True if the date range is valid.
Args:
start_date (str): The start date of the search.
end_date (str): The end date of the search.
"""
start_datetime = datetime.datetime.strptime(start_date, "%Y%m%d")
end_datetime = datetime.datetime.strptime(end_date, "%Y%m%d")
@@ -18,7 +22,12 @@ def check_date_range(start_date: str, end_date: str) -> bool:

def generate_request_query(query: dict, start_date: str, end_date: str) -> dict:
"""
Returns a dictionary of the request query.
Returns a request query object.
Args:
query (dict): The search query.
start_date (str): The start date of the search.
end_date (str): The end date of the search.
"""
request_query = {
"query": query.get("query"),
@@ -85,6 +94,9 @@ def process_data(
data["music_id"] = "'" + str(data["music_id"]) + "'"
data["search_key"] = search_key
data["queried_date"] = queried_date
data["create_time"] = datetime.datetime.utcfromtimestamp(
data["create_time"]
).strftime("%Y-%m-%d %H:%M:%S")

with open(file_path, "a", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=fields)
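
A small sketch of the utils helpers touched here (not part of this commit), assuming they are importable as tt_crawl.utils; the query dict below is hypothetical.

import datetime

from tt_crawl import utils as ut

# Dates are YYYYMMDD strings; check_date_range returns True for a valid span.
print(ut.check_date_range("20240201", "20240210"))

# Hypothetical search query; generate_request_query builds the request body
# from query["query"] and the date range.
query = {
    "query": {
        "and": [
            {"operation": "EQ", "field_name": "region_code", "field_values": ["US"]}
        ]
    }
}
print(ut.generate_request_query(query, "20240201", "20240210"))

# The same UTC formatting now applied to create_time in process_data.
print(datetime.datetime.utcfromtimestamp(1708819200).strftime("%Y-%m-%d %H:%M:%S"))  # 2024-02-25 00:00:00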
13 changes: 0 additions & 13 deletions tt_crawl/validation.py

This file was deleted.
