diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index ab9ed2a..5ebc4f7 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -34,4 +34,8 @@
 - Change: Individual csv files are now saved in a subfolder inside Data/root. Merged file will stay in Data/root Folder.
 
 [Version 0.0.27] - 2024-02-12
--- Added: Documentation for structuring query object
\ No newline at end of file
+- Added: Documentation for structuring query object
+
+[Version 0.0.28] - 2024-02-25
+- Fix: Fixed a bug that caused photo posts to be returned in query results.
+- Fix: Merged data will no longer contain duplicates.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 0894d21..5040f79 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,28 @@
 certifi==2023.11.17
 charset-normalizer==3.3.2
+docutils==0.20.1
 idna==3.6
+importlib-metadata==7.0.1
+jaraco.classes==3.3.0
+keyring==24.3.0
+markdown-it-py==3.0.0
+mdurl==0.1.2
+more-itertools==10.2.0
+nh3==0.2.15
+numpy==1.26.4
+pandas==2.1.4
+pkginfo==1.9.6
+Pygments==2.17.2
+python-dateutil==2.8.2
+pytz==2024.1
+pywin32-ctypes==0.2.2
+readme-renderer==42.0
 requests==2.31.0
+requests-toolbelt==1.0.0
+rfc3986==2.0.0
+rich==13.7.0
 six==1.16.0
-tzdata==2023.3
+twine==4.0.2
+tzdata==2024.1
 urllib3==2.1.0
+zipp==3.17.0
diff --git a/setup.py b/setup.py
index bb0f0dc..7a62fad 100644
--- a/setup.py
+++ b/setup.py
@@ -11,16 +11,16 @@
 setup(
     name="tt_crawl",
-    version="0.0.27",
-    description="A TikTok crawler",
+    version="0.0.28",
+    description="A Python package for interacting with the TikTok Research API",
     long_description=open("README.md").read() + "\n\n" + open("CHANGELOG.txt").read(),
     long_description_content_type="text/markdown",
     url="https://github.com/Saikamesh/Mindful-Tiktok",
     author="Sai Dwibhashyam",
     license="GPL-3.0 License",
     classifiers=classifiers,
-    keywords="TikTok, TikTok Research API, TikTok Data",
+    keywords="TikTok, TikTok Research API, TikTok API, TikTok Data",
     packages=find_packages(),
     python_requires=">=3.10",
-    install_requires=["requests==2.31.0", ],
+    install_requires=["requests==2.31.0", "pandas==2.1.4"],
 )
diff --git a/tt_crawl/helper.py b/tt_crawl/helper.py
new file mode 100644
index 0000000..d60213f
--- /dev/null
+++ b/tt_crawl/helper.py
@@ -0,0 +1,40 @@
+import requests
+import pandas as pd
+
+
+def validate_urls(response_json: dict) -> dict:
+    """
+    Removes all the photo posts from the response data.
+
+    Args:
+        response_json (dict): The response object from the TikTok API.
+    """
+
+    EMBED_URL = "https://www.tiktok.com/embed/"
+
+    # Build a filtered list instead of calling list.remove() while
+    # iterating, which would skip the element after each removal.
+    valid_videos = []
+    for video in response_json["data"]["videos"]:
+        url = EMBED_URL + str(video["id"])
+        res = requests.get(url)
+
+        if res.ok:
+            valid_videos.append(video)
+
+    response_json["data"]["videos"] = valid_videos
+    return response_json
+
+
+def remove_duplicate_rows(file: str) -> None:
+    """
+    Removes duplicate rows from a CSV file.
+
+    Args:
+        file (str): The path to the CSV file, including the file name.
+    """
+    df = pd.read_csv(file)
+    if not df["id"].duplicated().any():
+        return
+    df.drop_duplicates(subset="id", keep="first", inplace=True)
+    df.to_csv(file, index=False)
diff --git a/tt_crawl/tt_crawler.py b/tt_crawl/tt_crawler.py
index a31998f..5abfb71 100644
--- a/tt_crawl/tt_crawler.py
+++ b/tt_crawl/tt_crawler.py
@@ -7,7 +7,8 @@
 import re
 from typing import Union
 from . import utils as ut
-from . import validation as vl
+from . import helper as hl
+
 
 class TikTokCrawler:
     OAUTH_URL = "https://open.tiktokapis.com/v2/oauth/token/"
@@ -104,7 +105,7 @@ def _process_request(
             return err
         else:
             response_json = response.json()
-            res_json = vl.validate_urls(response_json)
+            res_json = hl.validate_urls(response_json)
             res_json["search_key"] = search_key
             res_json["queried_date"] = queried_date
             return res_json
@@ -146,7 +147,7 @@ def query_videos(
             response_list.append(response_json)
             return response_list
         else:
-            req = ut.generate_request_query(query, start_date, end_date) 
+            req = ut.generate_request_query(query, start_date, end_date)
             response_json = self._process_request(req, search_key, queried_date)
             return response_json
@@ -157,9 +158,9 @@ def make_csv(
         Makes a csv file from given data.
 
         Args:
-            data (Union[dict, list]): The data to be converted to csv.
-            file_name (str, optional): The name of the csv file. Defaults to a search key based on query.
-            data_dir (str, optional): The directory in which the csv file is to be stored. Defaults to current working dir.
+            data (Union[dict, list]): The data to be converted to csv. This is usually the response from the query_videos method.
+            file_name (str, optional): The name of the csv file. Defaults to a search key with date-time based on the query.
+            data_dir (str, optional): The directory in which the csv file is to be stored. Defaults to /Data/video_data in the current working dir.
         """
 
         fields = self.FIELDS.split(",") + ["search_key", "queried_date"]
@@ -190,22 +191,35 @@ def merge_all_data(self, data_dir: str = None, file_name: str = None) -> None:
         """
-        Merges all the csv files in the Data folder.
+        Merges multiple csv files into one file.
+
+        Args:
+            data_dir (str, optional): The path to the directory from which the csv files are to be read.
+            Defaults to the folder '/Data/video_data' in the current working dir.
+
+            The merged file is written to data_dir when one is given; otherwise it goes to /Data in the current working dir.
+
+            file_name (str, optional): The name of the merged csv file. Defaults to 'video_list.csv'.
+
+            Note: If file_name already exists, the data is appended to the existing file.
+            It is recommended to use a new file name every time you create a new merged file.
""" - if not data_dir: - data_dir = os.path.join(os.getcwd(), "Data", "video_data") + # file_name = (f"video_list_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" ) + if not file_name: file_name = "video_list.csv" - # file_name = ( - # f"merged_data_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" - # ) + + if not data_dir: + data_dir = os.path.join(os.getcwd(), "Data", "video_data") + file_path = os.path.join(os.getcwd(), "Data", file_name) + else: + file_path = os.path.join(data_dir, file_name) + file_path = file_path.replace("\\", "/") all_files = glob.glob(os.path.join(data_dir, "*.csv")) - file_path = os.path.join(os.getcwd(), "Data", file_name) + print(all_files) - with open( - os.path.join(file_path), "a", newline="", encoding="utf-8" - ) as fout: + with open(os.path.join(file_path), "a", newline="", encoding="utf-8") as fout: writer = csv.writer(fout) header_saved = False for filename in all_files: @@ -217,3 +231,5 @@ def merge_all_data(self, data_dir: str = None, file_name: str = None) -> None: header_saved = True for row in reader: writer.writerow(row) + + hl.remove_duplicate_rows(file_path) diff --git a/tt_crawl/utils.py b/tt_crawl/utils.py index e7a8f1c..4a61ee1 100644 --- a/tt_crawl/utils.py +++ b/tt_crawl/utils.py @@ -5,6 +5,10 @@ def check_date_range(start_date: str, end_date: str) -> bool: """ Returns True if the date range is valid. + + Args: + start_date (str): The start date of the search. + end_date (str): The end date of the search. """ start_datetime = datetime.datetime.strptime(start_date, "%Y%m%d") end_datetime = datetime.datetime.strptime(end_date, "%Y%m%d") @@ -18,7 +22,12 @@ def check_date_range(start_date: str, end_date: str) -> bool: def generate_request_query(query: dict, start_date: str, end_date: str) -> dict: """ - Returns a dictionary of the request query. + Returns a request query object. + + Args: + query (dict): The search query. + start_date (str): The start date of the search. + end_date (str): The end date of the search. """ request_query = { "query": query.get("query"), @@ -85,6 +94,9 @@ def process_data( data["music_id"] = "'" + str(data["music_id"]) + "'" data["search_key"] = search_key data["queried_date"] = queried_date + data["create_time"] = datetime.datetime.utcfromtimestamp( + data["create_time"] + ).strftime("%Y-%m-%d %H:%M:%S") with open(file_path, "a", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=fields) diff --git a/tt_crawl/validation.py b/tt_crawl/validation.py deleted file mode 100644 index 52bab97..0000000 --- a/tt_crawl/validation.py +++ /dev/null @@ -1,13 +0,0 @@ -import requests - - -def validate_urls(response_json: dict) -> dict: - EMBED_URL = "https://www.tiktok.com/embed/" - - for video in response_json["data"]["videos"]: - url = EMBED_URL + str(video["id"]) - res = requests.get(url) - if res.status_code == 400: - response_json["data"]["videos"].remove(video) - - return response_json