Fix: removing Photo posts & duplicates | v 0.0.28
Saikamesh committed Feb 25, 2024
1 parent 30be9e8 commit 645d8ce
Showing 7 changed files with 113 additions and 36 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.txt
@@ -34,4 +34,8 @@
- Change: Individual csv files are now saved in a subfolder inside Data/root. Merged file will stay in Data/root Folder.

[Version 0.0.27] - 2024-02-12
- Added: Documentation for structuring query object
- Added: Documentation for structuring query object

[Version 0.0.28] - 2024-02-25
- Fix: Fixed a bug that caused photo posts to be returned in the query results.
- Fix: Merged data will no longer contain duplicates.
23 changes: 22 additions & 1 deletion requirements.txt
@@ -1,7 +1,28 @@
certifi==2023.11.17
charset-normalizer==3.3.2
docutils==0.20.1
idna==3.6
importlib-metadata==7.0.1
jaraco.classes==3.3.0
keyring==24.3.0
markdown-it-py==3.0.0
mdurl==0.1.2
more-itertools==10.2.0
nh3==0.2.15
numpy==1.26.4
pandas==2.1.4
pkginfo==1.9.6
Pygments==2.17.2
python-dateutil==2.8.2
pytz==2024.1
pywin32-ctypes==0.2.2
readme-renderer==42.0
requests==2.31.0
requests-toolbelt==1.0.0
rfc3986==2.0.0
rich==13.7.0
six==1.16.0
tzdata==2023.3
twine==4.0.2
tzdata==2024.1
urllib3==2.1.0
zipp==3.17.0
8 changes: 4 additions & 4 deletions setup.py
@@ -11,16 +11,16 @@

setup(
name="tt_crawl",
version="0.0.27",
description="A TikTok crawler",
version="0.0.28",
description="A Python package for interacting with TikTok Research API",
long_description=open("README.md").read() + "\n\n" + open("CHANGELOG.txt").read(),
long_description_content_type="text/markdown",
url="https://github.com/Saikamesh/Mindful-Tiktok",
author="Sai Dwibhashyam",
license="GPL-3.0 License",
classifiers=classifiers,
keywords="TikTok, TikTok Research API, TikTok Data",
keywords="TikTok, TikTok Research API, TikTok API, TikTok Data",
packages=find_packages(),
python_requires=">=3.10",
install_requires=["requests==2.31.0", ],
install_requires=["requests==2.31.0", "pandas==2.1.4"],
)
37 changes: 37 additions & 0 deletions tt_crawl/helper.py
@@ -0,0 +1,37 @@
import requests
import pandas as pd


def validate_urls(response_json: dict) -> dict:
    """
    Removes all photo posts from the data.
    Args:
        response_json (dict): The response object from the TikTok API.
    Returns:
        dict: The response object with photo posts removed.
    """

    EMBED_URL = "https://www.tiktok.com/embed/"

    # Iterate over a copy so that removing an item does not skip the next one.
    for video in list(response_json["data"]["videos"]):
        url = EMBED_URL + str(video["id"])
        res = requests.get(url)

        # Posts with no video embed (photo posts) fail this request and are dropped.
        if not res.ok:
            response_json["data"]["videos"].remove(video)

    return response_json


def remove_duplicate_rows(file: str) -> None:
    """
    Removes duplicate rows from a CSV file.
    Args:
        file (str): The path to the CSV file, including the file name.
    """
    df = pd.read_csv(file)
    duplicate_data = df[df.id.duplicated()]
    if duplicate_data.empty:
        return
    # Keep the first occurrence of each id and overwrite the file in place.
    df.drop_duplicates(subset="id", keep="first", inplace=True)
    df.to_csv(file, index=False)
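
A minimal usage sketch for the new helpers (not part of this commit). It assumes the package is installed as tt_crawl and that the response has the data.videos shape used above; the video ids and CSV path are hypothetical, and validate_urls makes one HTTP request per video.

from tt_crawl import helper as hl

# Hypothetical response shaped like the object validate_urls expects.
response_json = {
    "data": {
        "videos": [
            {"id": 7300000000000000001, "video_description": "regular video"},
            {"id": 7300000000000000002, "video_description": "photo post"},
        ]
    }
}

# Entries whose embed URL does not resolve are dropped.
filtered = hl.validate_urls(response_json)
print(len(filtered["data"]["videos"]))

# Removes rows with a repeated id from a previously written CSV file (hypothetical path).
hl.remove_duplicate_rows("Data/video_data/sample.csv")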
48 changes: 32 additions & 16 deletions tt_crawl/tt_crawler.py
@@ -7,7 +7,8 @@
import re
from typing import Union
from . import utils as ut
from . import validation as vl
from . import helper as hl


class TikTokCrawler:
OAUTH_URL = "https://open.tiktokapis.com/v2/oauth/token/"
@@ -104,7 +105,7 @@ def _process_request(
return err
else:
response_json = response.json()
res_json = vl.validate_urls(response_json)
res_json = hl.validate_urls(response_json)
res_json["search_key"] = search_key
res_json["queried_date"] = queried_date
return res_json
@@ -146,7 +147,7 @@ def query_videos(
response_list.append(response_json)
return response_list
else:
req = ut.generate_request_query(query, start_date, end_date)
req = ut.generate_request_query(query, start_date, end_date)
response_json = self._process_request(req, search_key, queried_date)
return response_json

@@ -157,9 +158,9 @@ def make_csv(
Makes a csv file from given data.
Args:
data (Union[dict, list]): The data to be converted to csv.
file_name (str, optional): The name of the csv file. Defaults to a search key based on query.
data_dir (str, optional): The directory in which the csv file is to be stored. Defaults to current working dir.
data (Union[dict, list]): The data to be converted to csv. This is usually the response from the query_videos method.
file_name (str, optional): The name of the csv file. Defaults to a search key with date-time based on query.
data_dir (str, optional): The directory in which the csv file is to be stored. Defaults to /Data/video_data in current working dir.
"""
fields = self.FIELDS.split(",") + ["search_key", "queried_date"]

@@ -190,22 +191,35 @@ def make_csv(

def merge_all_data(self, data_dir: str = None, file_name: str = None) -> None:
"""
Merges all the csv files in the Data folder.
Merges multiple csv files into one file.
Args:
data_dir (str, optional): The path to the directory from which the csv files are to be read.
Defaults to folder '/Data/video_data' in current working dir.
The merged file is stored in the same directory as the data_dir. Defaults to /Data in current working dir.
file_name (str, optional): The name of the merged csv file. Defaults to 'video_list.csv'.
Note: If the file_name already exists, the data is appended to the existing file.
It is recommended to use a new file name every time you want to create a new merged file.
"""
if not data_dir:
data_dir = os.path.join(os.getcwd(), "Data", "video_data")
# file_name = (f"video_list_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" )

if not file_name:
file_name = "video_list.csv"
# file_name = (
# f"merged_data_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
# )

if not data_dir:
data_dir = os.path.join(os.getcwd(), "Data", "video_data")
file_path = os.path.join(os.getcwd(), "Data", file_name)
else:
file_path = os.path.join(data_dir, file_name)
file_path = file_path.replace("\\", "/")

all_files = glob.glob(os.path.join(data_dir, "*.csv"))
file_path = os.path.join(os.getcwd(), "Data", file_name)
print(all_files)

with open(
os.path.join(file_path), "a", newline="", encoding="utf-8"
) as fout:
with open(os.path.join(file_path), "a", newline="", encoding="utf-8") as fout:
writer = csv.writer(fout)
header_saved = False
for filename in all_files:
@@ -217,3 +231,5 @@ def merge_all_data(self, data_dir: str = None, file_name: str = None) -> None:
header_saved = True
for row in reader:
writer.writerow(row)

hl.remove_duplicate_rows(file_path)
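
A rough end-to-end sketch of the changed flow (not part of this commit). The constructor arguments and the exact query_videos signature are assumptions based on this diff and may differ; the query dict is a hypothetical example of the structure read by utils.generate_request_query.

from tt_crawl.tt_crawler import TikTokCrawler

# Hypothetical constructor arguments -- the credentials the crawler needs
# are not shown in this diff.
crawler = TikTokCrawler("your-client-key", "your-client-secret")

# Hypothetical search query; generate_request_query reads query["query"].
query = {
    "query": {
        "and": [
            {"operation": "IN", "field_name": "region_code", "field_values": ["US"]}
        ]
    }
}

# Dates are YYYYMMDD strings, as expected by utils.check_date_range.
data = crawler.query_videos(query, "20240201", "20240210")

# Photo posts are already filtered out by helper.validate_urls inside
# _process_request before the data reaches make_csv.
crawler.make_csv(data)

# Per-query csv files are merged, and helper.remove_duplicate_rows is
# applied to the merged file at the end of merge_all_data.
crawler.merge_all_data(file_name="video_list.csv")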
14 changes: 13 additions & 1 deletion tt_crawl/utils.py
@@ -5,6 +5,10 @@
def check_date_range(start_date: str, end_date: str) -> bool:
"""
Returns True if the date range is valid.
Args:
start_date (str): The start date of the search.
end_date (str): The end date of the search.
"""
start_datetime = datetime.datetime.strptime(start_date, "%Y%m%d")
end_datetime = datetime.datetime.strptime(end_date, "%Y%m%d")
@@ -18,7 +22,12 @@ def check_date_range(start_date: str, end_date: str) -> bool:

def generate_request_query(query: dict, start_date: str, end_date: str) -> dict:
"""
Returns a dictionary of the request query.
Returns a request query object.
Args:
query (dict): The search query.
start_date (str): The start date of the search.
end_date (str): The end date of the search.
"""
request_query = {
"query": query.get("query"),
@@ -85,6 +94,9 @@ def process_data(
data["music_id"] = "'" + str(data["music_id"]) + "'"
data["search_key"] = search_key
data["queried_date"] = queried_date
data["create_time"] = datetime.datetime.utcfromtimestamp(
data["create_time"]
).strftime("%Y-%m-%d %H:%M:%S")

with open(file_path, "a", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=fields)
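
A small sketch of the utils helpers touched here (not part of this commit), assuming they are importable as tt_crawl.utils; the query dict below is hypothetical.

import datetime

from tt_crawl import utils as ut

# Dates are YYYYMMDD strings; check_date_range returns True for a valid span.
print(ut.check_date_range("20240201", "20240210"))

# Hypothetical search query; generate_request_query builds the request body
# from query["query"] and the date range.
query = {
    "query": {
        "and": [
            {"operation": "EQ", "field_name": "region_code", "field_values": ["US"]}
        ]
    }
}
print(ut.generate_request_query(query, "20240201", "20240210"))

# The same UTC formatting now applied to create_time in process_data.
print(datetime.datetime.utcfromtimestamp(1708819200).strftime("%Y-%m-%d %H:%M:%S"))  # 2024-02-25 00:00:00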
13 changes: 0 additions & 13 deletions tt_crawl/validation.py

This file was deleted.
