Skip to content

Commit

Permalink
Refactor BaseParser and subclass parsers
Browse files Browse the repository at this point in the history
The BaseParser class has now been inherited by subclasses, replacing the BaseModel inheritance, thus streamlining parser implementation. This commit also includes changes to the TiktokParser to retrieve media data correctly and updates other parsers (Instagram, Youtube, Twitter, Reddit) to reflect the BaseParser inheritance change. In RedditParser, a simple main function has been added for testing purposes.

Signed-off-by: Jag_k <[email protected]>
  • Loading branch information
jag-k committed May 12, 2024
1 parent dc3eb34 commit 4005d99
Show file tree
Hide file tree
Showing 9 changed files with 86 additions and 77 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ci:

repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: 'v0.4.3'
rev: 'v0.4.4'
hooks:
- id: ruff
args: [ --fix, --exit-non-zero-on-fix ]
Expand Down
8 changes: 6 additions & 2 deletions media_parser/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import aiohttp
from motor.motor_asyncio import AsyncIOMotorCollection
from pydantic import BaseModel, ConfigDict
from pydantic import BaseModel, ConfigDict, PrivateAttr

from media_parser.database import GroupedMediaModel, MongoModelController
from media_parser.models import Media, ParserType
Expand Down Expand Up @@ -57,19 +57,23 @@ class BaseParserConfig(TypedDict):
type: Required[ParserType]


class BaseParser(ABC):
class BaseParser(ABC, BaseModel):
TYPE: ClassVar[ParserType]
_parsers: list["BaseParser"] = PrivateAttr(default_factory=list)

def __init__(self, *args, config: dict[str, dict[str, Any]] | None = None, **kwargs):
super().__init__(*args, **kwargs)
if config is None:
config = {}

if type(self) is BaseParser:
self._parsers: list[BaseParser] = [
parser(**conf)
for parser in BaseParser.__subclasses__()
if (conf := config.get(parser.TYPE.value.lower(), None)) is not None
]
else:
self._parsers = [self]

def supported(self) -> dict[ParserType, bool]:
return {parser.TYPE: parser._is_supported() for parser in self._parsers}
Expand Down
8 changes: 3 additions & 5 deletions media_parser/parsers/instagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,17 @@
from re import Match, Pattern

import aiohttp
from pydantic import BaseModel, Field
from pydantic import Field

from media_parser.models import Media, ParserType, Video

from .base import BaseParser as BaseParser
from .base import MediaCache
from media_parser.parsers.base import BaseParser, MediaCache

logger = logging.getLogger(__name__)

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"


class InstagramParser(BaseParser, BaseModel, type=ParserType.INSTAGRAM):
class InstagramParser(BaseParser, type=ParserType.INSTAGRAM):
instagram_saas_token: str | None = Field(default=None, description="Set this for enable instagram proxy")
instagram_saas_api: str = Field(
default="https://api.lamadava.com", description="Set this to change instagram saas api"
Expand Down
23 changes: 18 additions & 5 deletions media_parser/parsers/reddit.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
import asyncio
import logging
import re
from re import Match, Pattern
from urllib.parse import urlparse

import aiohttp
from aiohttp import InvalidURL
from pydantic import BaseModel, Field
from pydantic import Field

from media_parser.models import Media, ParserType, Video

from .base import BaseParser as BaseParser
from .base import MediaCache
from media_parser.parsers.base import BaseParser, MediaCache

logger = logging.getLogger(__name__)


class RedditParser(BaseParser, BaseModel, type=ParserType.REDDIT):
class RedditParser(BaseParser, type=ParserType.REDDIT):
user_agent: str | None = Field("video downloader (by u/Jag_k)", description="User agent for Reddit API")
client_id: str = Field(..., description="Client ID for Reddit API")
client_secret: str = Field(..., description="Client secret for Reddit API")
Expand Down Expand Up @@ -129,3 +128,17 @@ def id_from_url(url: str) -> str:
if not submission_id.isalnum():
raise InvalidURL(url)
return submission_id


if __name__ == "__main__":
    # Manual smoke test: run this module directly to exercise RedditParser
    # end-to-end against a sample submission URL. The URL is scheme-less
    # ("www.reddit.com/...") — presumably the parser's URL matching accepts
    # that; confirm against RedditParser's reg_exps.
    #
    # NOTE(review): RedditParser declares client_id / client_secret as required
    # fields (Field(...)) in the visible class definition, so a bare
    # RedditParser() call likely fails pydantic validation unless credentials
    # are supplied some other way — confirm before relying on this test.

    async def main():
        # One shared aiohttp session for the whole parse call; closed on exit
        # by the async context manager.
        async with aiohttp.ClientSession() as session:
            print(
                await RedditParser().parse(
                    session,
                    "www.reddit.com/r/redditdev/comments/2gmzqe/praw_https/",
                )
            )

    asyncio.run(main())
63 changes: 31 additions & 32 deletions media_parser/parsers/tiktok.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,13 @@

import aiohttp
from aiohttp import ClientSession
from pydantic import BaseModel, Field
from pydantic import Field

from media_parser.context import MAX_SIZE
from media_parser.context import get_max_size
from media_parser.models import Image, Media, ParserType, Video
from media_parser.parsers.base import BaseParser, MediaCache
from media_parser.utils import generate_timer

from .base import BaseParser as BaseParser
from .base import MediaCache

logger = logging.getLogger(__name__)

TT_USER_AGENT = (
Expand All @@ -25,7 +23,7 @@
time_it = generate_timer(logger)


class TiktokParser(BaseParser, BaseModel, type=ParserType.TIKTOK):
class TiktokParser(BaseParser, type=ParserType.TIKTOK):
user_agent: str = Field(default=TT_USER_AGENT)

def reg_exps(self):
Expand All @@ -36,7 +34,9 @@ def reg_exps(self):
# https://vm.tiktok.com/ZSRq1jcrg/
re.compile(r"(?:https?://)?(?:(?P<domain>[a-z]{2})\.)?tiktok\.com/(?P<id>\w+)/?"),
# https://www.tiktok.com/@thejoyegg/video/7136001098841591041
re.compile(r"(?:https?://)?(?:www\.)?tiktok\.com/@(?P<author>\w+)/video/(?P<video_id>\d+)/?"),
re.compile(
r"(?:https?://)?(?:www\.)?tiktok\.com/@(?P<author>\w+)/(?P<type_of>video|photo)/(?P<video_id>\d+)/?"
),
]

def _is_supported(self) -> bool:
Expand Down Expand Up @@ -85,7 +85,7 @@ async def _parse(

try:
with time_it("tiktok_get_video_data"):
data: dict = await self._get_video_data(video_id)
data: dict = await self._get_media_data(session, video_id)

except Exception as e:
logger.exception(
Expand Down Expand Up @@ -118,10 +118,7 @@ async def _parse(
def _process_video(self, data: dict, original_url: str) -> list[Video]:
max_quality_url = data.get("video_data", {}).get("nwm_video_url_HQ")

try:
max_size = float(MAX_SIZE.get("inf"))
except ValueError:
max_size = float("inf")
max_size = get_max_size()

try:
url: str | None = max(
Expand Down Expand Up @@ -198,26 +195,29 @@ async def _get_video_id(cls, url: str) -> tuple[str, int] | None:
return author, int(base)

@staticmethod
async def _get_video_data(video_id: int) -> dict:
async with ClientSession(
headers={
"Accept": "application/json",
"User-Agent": TT_USER_AGENT,
}
) as session:
async with session.get(
"https://api16-normal-c-useast1a.tiktokv.com/aweme/v1/feed/",
params={
"aweme_id": video_id,
},
) as resp:
raw_data: dict = await resp.json()
if not raw_data:
logger.error("Empty response with %r", resp.url)
return {}
async def _get_media_data(session: ClientSession, video_id: int) -> dict:
async with session.get(
"https://api16-normal-c-useast1a.tiktokv.com/aweme/v1/feed/",
params={
"iid": "7318518857994389254",
"device_id": "7318517321748022790",
"channel": "googleplay",
"app_name": "musical_ly",
"version_code": "300904",
"device_platform": "android",
"device_type": "ASUS_Z01QD",
"os_version": "9",
"aweme_id": video_id,
},
) as resp:
raw_data: dict = await resp.json()
if not raw_data:
logger.error("Empty response with %r", resp.url)
return {}
if not raw_data.get("aweme_list", []):
logger.info("No aweme_list in response")
return {}

data = raw_data["aweme_list"][0]
url_type_code = data["aweme_type"]
url_type_code_dict = {
Expand Down Expand Up @@ -269,12 +269,11 @@ async def _get_video_data(video_id: int) -> dict:
if __name__ == "__main__":

async def main():
parser = TiktokParser()
async with ClientSession() as session:
print(
await parser.parse(
await TiktokParser().parse(
session,
"https://vm.tiktok.com/ZMYQFQBQ9/",
"https://www.tiktok.com/@kastella_/photo/7364744013653527814",
)
)

Expand Down
8 changes: 3 additions & 5 deletions media_parser/parsers/twitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@
from re import Match

import aiohttp
from pydantic import BaseModel, Field
from pydantic import Field

from media_parser.models import Media, ParserType, Video

from .base import BaseParser as BaseParser
from .base import MediaCache
from media_parser.parsers.base import BaseParser, MediaCache

logger = logging.getLogger(__name__)

Expand All @@ -18,7 +16,7 @@
X_RE = re.compile(r"(?:https?://)?(?:www\.)?x\.com/(?P<user>\w+)/status/(?P<id>\d+)")


class TwitterParser(BaseParser, BaseModel, type=ParserType.TWITTER):
class TwitterParser(BaseParser, type=ParserType.TWITTER):
twitter_bearer_token: str = Field(..., description="Bearer token for Twitter API")

def reg_exps(self):
Expand Down
7 changes: 2 additions & 5 deletions media_parser/parsers/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,17 @@
import aiohttp
import httpx
import pytube
from pydantic import BaseModel
from pytube import StreamQuery
from pytube.exceptions import PytubeError

from media_parser.context import get_max_size
from media_parser.models import Media, ParserType, Video

from .base import BaseParser as BaseParser
from .base import MediaCache
from media_parser.parsers.base import BaseParser, MediaCache

logger = logging.getLogger(__name__)


class YoutubeParser(BaseParser, BaseModel, type=ParserType.YOUTUBE):
class YoutubeParser(BaseParser, type=ParserType.YOUTUBE):
def reg_exps(self):
return [
# https://www.youtube.com/watch?v=TCrP1SE2DkY
Expand Down
42 changes: 21 additions & 21 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "media-parser"
version = "2.0.4"
version = "2.1.0"
description = "API for parsing media from social networks"
authors = ["Jag_k <[email protected]>"]
maintainers = ["Jag_k <[email protected]>"]
Expand Down

0 comments on commit 4005d99

Please sign in to comment.