diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 3638b2e..ab9ed2a 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -24,4 +24,14 @@ [Version 0.0.24] - 2024-01-21 - Fix: Video Id & Music Id will no longer be effected by excel file conversion. - Added: CSV file will now contain a column with the search_key & date on which the query was made. -- Code refractoring & structural changes. \ No newline at end of file +- Code refractoring & structural changes. + +[Version 0.0.25] - 2024-01-27 +- Change: create_time now shows standard time format instead of unix timestamp. + +[Version 0.0.26] - 2024-02-02 +- Added: Tiktok posts containing images are no longer returned. +- Change: Individual csv files are now saved in a subfolder inside Data/root. Merged file will stay in Data/root Folder. + +[Version 0.0.27] - 2024-02-12 +- Added: Documentation for structuring query object \ No newline at end of file diff --git a/README.md b/README.md index 3227895..986100c 100644 --- a/README.md +++ b/README.md @@ -18,10 +18,10 @@ pip install -i https://test.pypi.org/simple/ tt-crawl # Instructions -To learn how to construct your own query, use the [tiktok documentation](https://developers.tiktok.com/doc/research-api-specs-query-videos/) +To learn how to construct your own query, see [here](./howToQuery.md) or refer to the [tiktok documentation](https://developers.tiktok.com/doc/research-api-specs-query-videos/) -Perform a query +Performing a query ``` diff --git a/howToQuery.md b/howToQuery.md new file mode 100644 index 0000000..7f30e36 --- /dev/null +++ b/howToQuery.md @@ -0,0 +1,126 @@ +## Tiktok research API + +How to structure a request body? + +Below is the example of a request body. 
+ +```json +request_body = { + "query": { + "and": [ + {"operation": "IN", "field_name": "region_code", "field_values": ["JP", "US"]}, + {"operation": "EQ", "field_name": "hashtag_name", "field_values": ["Valorant"]}, + ], + "or": [ + {"operation": "EQ", "field_name": "video_length", "field_values": ["MID"]}, + {"operation": "EQ", "field_name": "video_length", "field_values": ["LONG"]}, + ], + "not": [ + {"operation": "EQ", "field_name": "video_length", "field_values": ["SHORT"]} + ], + } +} +``` + +The request body is a query object which is used to get the information from the API. + +Every query object has 3 children: `and`, `or`, and `not`, each of which is a list of conditions. + +The `and` conditions specify that all the conditions in the list must be met. + +The `or` conditions specify that at least one of the conditions in the list must be met. + +The `not` conditions specify that none of the conditions in the list may be met. + +A valid query must contain at least one non-empty `and`, `or`, or `not` condition list. + +A condition is an object that specifies the field name, the operation, and the field values to restrict the query. + +### operation: + +--- + +**Possible values**: "EQ", "IN", "GT", "GTE", "LT", "LTE" + +**Value Descriptions:** + +**EQ**: equal to + +**IN**: in + +**GT**: greater than + +**LT**: less than + +**GTE**: greater than or equal to + +**LTE**: less than or equal to + +### field_name & field_values: + +--- + +Depending on the field_name you’ve chosen, use the corresponding field_values that apply to that field name. 
+ +Refer to the table below. + +| Field Name | Description | Example | +| ------------ | --------------------------------------------------------------------------------- | ------------------------------------ | +| create_date | The video creation date in UTC, presented in the format YYYYMMDD | 20220910 | +| username | The username of the video creator | "cookie_love_122" | +| region_code | A two-letter code for the country where the video creator registered their account | "US", "UK", "IN", "JP", … | +| video_id | The unique identifier of the video | 6978662169214864645 | +| hashtag_name | The hashtag associated with the video | "arianagrande", "celebrity" | +| keyword | The keyword in the video description | "tiktok" | +| music_id | The music ID of the video. | 8978345345214861235 | +| effect_id | The effect ID of the video. | 3957392342148643476 | +| video_length | The duration of the video | "SHORT", "MID", "LONG", "EXTRA_LONG" | + +`SHORT: <15s, +MID: 15s~1min, +LONG: 1~5min, +EXTRA_LONG: >5min` + +--- + +### Example + +Below is an example of how to write a query object based on a set of requirements. + +Let’s say you want to get data about videos which contain the hashtag #nfl from the US region, and you do not want any videos which are shorter than 15s. Below is how you write the query for it: + +```json +request_body = { + "query": { + "and": [ + {"operation": "IN", "field_name": "region_code","field_values": ["US"]}, + {"operation": "EQ", "field_name": "hashtag_name", "field_values": ["nfl"]}, + ], + "or": [], + "not": [ + {"operation": "EQ", "field_name": "video_length", "field_values": ["SHORT"]} + ], + } +} +``` + +We have left the `or` condition empty as there is no need for it. 
+ +consider for a moment that we need to get the data from either US or Japan, below is how we’d structure our query object, + +```json +request_body = { + "query": { + "and": [ + {"operation": "EQ", "field_name": "hashtag_name", "field_values": ["nfl"]}, + ], + "or": [ + {"operation": "IN", "field_name": "region_code","field_values": ["US"]}, + {"operation": "IN", "field_name": "region_code","field_values": ["JP"]} + ], + "not": [ + {"operation": "EQ", "field_name": "video_length", "field_values": ["SHORT"]} + ], + } +} +``` \ No newline at end of file diff --git a/setup.py b/setup.py index 5335832..bb0f0dc 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name="tt_crawl", - version="0.0.24", + version="0.0.27", description="A TikTok crawler", long_description=open("README.md").read() + "\n\n" + open("CHANGELOG.txt").read(), long_description_content_type="text/markdown", @@ -21,6 +21,6 @@ classifiers=classifiers, keywords="TikTok, TikTok Research API, TikTok Data", packages=find_packages(), - install_requires=["requests==2.31.0", ], python_requires=">=3.10", + install_requires=["requests==2.31.0", ], ) diff --git a/tt_crawl/tt_crawler.py b/tt_crawl/tt_crawler.py index fdc736d..a31998f 100644 --- a/tt_crawl/tt_crawler.py +++ b/tt_crawl/tt_crawler.py @@ -7,7 +7,7 @@ import re from typing import Union from . import utils as ut - +from . 
def validate_urls(response_json: dict, session=None, timeout=None) -> dict:
    """Drop videos whose TikTok embed page answers with HTTP 400.

    Every entry of ``response_json["data"]["videos"]`` is probed with a GET
    request to ``https://www.tiktok.com/embed/<id>``. Entries whose response
    status code is exactly 400 are removed; all other entries are kept in
    their original order.

    The video list is filtered in place (the same list object stays attached
    to ``response_json``) and ``response_json`` itself is returned.

    Args:
        response_json: Mapping with a ``"data"`` mapping that holds a
            ``"videos"`` list; each video must have an ``"id"`` key.
        session: Optional object exposing ``get(url, timeout=...)`` and
            returning an object with a ``status_code`` attribute, e.g. a
            ``requests.Session`` for connection reuse. Defaults to the
            ``requests`` module.
        timeout: Optional timeout forwarded to ``get``. ``None`` (the
            default) means no timeout is applied.

    Returns:
        The same ``response_json`` object, with rejected videos removed.

    Raises:
        KeyError: If ``"data"``, ``"videos"`` or a video's ``"id"`` is
            missing.
    """
    EMBED_URL = "https://www.tiktok.com/embed/"

    videos = response_json["data"]["videos"]
    if not videos:
        # Nothing to probe, so no HTTP client is needed.
        return response_json

    if session is None:
        # Imported here so the function is self-contained and the HTTP
        # client is only loaded when a real check has to be performed.
        import requests

        session = requests

    # Build the filtered list first and assign it back through a slice.
    # Calling list.remove() while iterating the same list shifts the
    # remaining items left, so the element after every removed video would
    # be skipped and never checked.
    videos[:] = [
        video
        for video in videos
        if session.get(EMBED_URL + str(video["id"]), timeout=timeout).status_code != 400
    ]

    return response_json