From bc3f16e86b5c65591b81666c62e238c1d25b47bf Mon Sep 17 00:00:00 2001 From: taizan-hokouto <55448286+taizan-hokuto@users.noreply.github.com> Date: Sat, 5 Dec 2020 14:39:55 +0900 Subject: [PATCH] Move functions --- pytchat/cli/cli_extractor.py | 5 ++- pytchat/cli/echo.py | 2 +- pytchat/core/__init__.py | 2 +- pytchat/core/pytchat.py | 40 +++++++++++---------- pytchat/core_async/livechat.py | 48 ++++++++++++------------- pytchat/core_multithread/livechat.py | 53 ++++++++++++++-------------- pytchat/tool/extract/extractor.py | 4 +-- pytchat/tool/videoinfo.py | 2 +- 8 files changed, 80 insertions(+), 76 deletions(-) diff --git a/pytchat/cli/cli_extractor.py b/pytchat/cli/cli_extractor.py index 53053a2..4274798 100644 --- a/pytchat/cli/cli_extractor.py +++ b/pytchat/cli/cli_extractor.py @@ -12,7 +12,6 @@ from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchErr from .. processors.html_archiver import HTMLArchiver from .. tool.extract.extractor import Extractor from .. tool.videoinfo import VideoInfo -from .. util.extract_video_id import extract_video_id class CLIExtractor: @@ -25,7 +24,7 @@ class CLIExtractor: print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}") try: - video_id = extract_video_id(video_id) + video_id = util.extract_video_id(video_id) separated_path = str(Path(Arguments().output)) + os.path.sep path = util.checkpath(separated_path + video_id + '.html') try: @@ -118,4 +117,4 @@ def clear_tasks(): except Exception as e: print(str(e)) if Arguments().debug: - traceback.print_exc() \ No newline at end of file + traceback.print_exc() diff --git a/pytchat/cli/echo.py b/pytchat/cli/echo.py index 95876ab..0c1b56a 100644 --- a/pytchat/cli/echo.py +++ b/pytchat/cli/echo.py @@ -1,6 +1,6 @@ import pytchat from ..exceptions import ChatDataFinished, NoContents -from ..util.extract_video_id import extract_video_id +from ..util import extract_video_id class Echo: diff --git a/pytchat/core/__init__.py b/pytchat/core/__init__.py index 9b98c18..644d281 100644 --- a/pytchat/core/__init__.py +++ b/pytchat/core/__init__.py @@ -1,5 +1,5 @@ from .pytchat import PytchatCore -from .. util.extract_video_id import extract_video_id +from .. util import extract_video_id def create(video_id: str, **kwargs): diff --git a/pytchat/core/pytchat.py b/pytchat/core/pytchat.py index 004a3a5..361057d 100644 --- a/pytchat/core/pytchat.py +++ b/pytchat/core/pytchat.py @@ -3,14 +3,13 @@ import json import signal import time import traceback -import urllib.parse from ..parser.live import Parser from .. import config from .. import exceptions from ..paramgen import liveparam, arcparam from ..processors.default.processor import DefaultProcessor from ..processors.combinator import Combinator -from ..util.extract_video_id import extract_video_id +from .. import util headers = config.headers MAX_RETRY = 10 @@ -52,8 +51,6 @@ class PytchatCore: Flag to stop getting chat. ''' - _setup_finished = False - def __init__(self, video_id, seektime=-1, processor=DefaultProcessor(), @@ -63,7 +60,7 @@ class PytchatCore: hold_exception=True, logger=config.logger(__name__), ): - self._video_id = extract_video_id(video_id) + self._video_id = util.extract_video_id(video_id) self.seektime = seektime if isinstance(processor, tuple): self.processor = Combinator(processor) @@ -78,8 +75,10 @@ class PytchatCore: exception_holder=self._exception_holder ) self._first_fetch = True - self._fetch_url = "live_chat/get_live_chat?continuation=" + self._fetch_url = config._sml self._topchat_only = topchat_only + self._dat = '' + self._last_offset_ms = 0 self._logger = logger if interruptable: signal.signal(signal.SIGINT, lambda a, b: self.terminate()) @@ -91,7 +90,7 @@ class PytchatCore: create and start _listen loop. """ self.continuation = liveparam.getparam(self._video_id, 3) - + def _get_chat_component(self): ''' Fetch chat data and store them into buffer, @@ -114,6 +113,7 @@ class PytchatCore: "chatdata": chatdata } self.continuation = metadata.get('continuation') + self._last_offset_ms = metadata.get('last_offset_ms', 0) return chat_component except exceptions.ChatParseException as e: self._logger.debug(f"[{self._video_id}]{str(e)}") @@ -132,39 +132,43 @@ class PytchatCore: 'continuationContents' which includes metadata & chat data. ''' livechat_json = ( - self._get_livechat_json(continuation, client, headers) + self._get_livechat_json(continuation, client, replay=self._is_replay, offset_ms=self._last_offset_ms) ) - contents = self._parser.get_contents(livechat_json) + contents, dat = self._parser.get_contents(livechat_json) + if self._dat == '' and dat: + self._dat = dat if self._first_fetch: if contents is None or self._is_replay: '''Try to fetch archive chat data.''' self._parser.is_replay = True - self._fetch_url = "live_chat_replay/get_live_chat_replay?continuation=" + self._fetch_url = config._smr continuation = arcparam.getparam( self._video_id, self.seektime, self._topchat_only) - livechat_json = (self._get_livechat_json(continuation, client, headers)) + livechat_json = (self._get_livechat_json(continuation, client, replay=True, offset_ms=self.seektime * 1000)) reload_continuation = self._parser.reload_continuation( - self._parser.get_contents(livechat_json)) + self._parser.get_contents(livechat_json)[0]) if reload_continuation: livechat_json = (self._get_livechat_json( reload_continuation, client, headers)) - contents = self._parser.get_contents(livechat_json) + contents, _ = self._parser.get_contents(livechat_json) self._is_replay = True self._first_fetch = False return contents - def _get_livechat_json(self, continuation, client, headers): + def _get_livechat_json(self, continuation, client, replay: bool, offset_ms: int = 0): ''' Get json which includes chat data. ''' - continuation = urllib.parse.quote(continuation) livechat_json = None err = None - url = f"https://www.youtube.com/{self._fetch_url}{continuation}&pbj=1" + if offset_ms < 0: + offset_ms = 0 + param = util.get_param(continuation, dat=self._dat, replay=replay, offsetms=offset_ms) for _ in range(MAX_RETRY + 1): - with client: + with httpx.Client(http2=True) as client: try: - livechat_json = client.get(url, headers=headers).json() + response = client.post(self._fetch_url, json=param) + livechat_json = json.loads(response.text, encoding='utf-8') break except (json.JSONDecodeError, httpx.ConnectTimeout, httpx.ReadTimeout, httpx.ConnectError) as e: err = e diff --git a/pytchat/core_async/livechat.py b/pytchat/core_async/livechat.py index 1c0231e..b0d9b97 100644 --- a/pytchat/core_async/livechat.py +++ b/pytchat/core_async/livechat.py @@ -5,17 +5,16 @@ import json import signal import time import traceback -import urllib.parse from asyncio import Queue from concurrent.futures import CancelledError from .buffer import Buffer from ..parser.live import Parser from .. import config from .. import exceptions +from .. import util from ..paramgen import liveparam, arcparam from ..processors.default.processor import DefaultProcessor from ..processors.combinator import Combinator -from ..util.extract_video_id import extract_video_id headers = config.headers MAX_RETRY = 10 @@ -84,7 +83,7 @@ class LiveChatAsync: topchat_only=False, logger=config.logger(__name__), ): - self._video_id = extract_video_id(video_id) + self._video_id = util.extract_video_id(video_id) self.seektime = seektime if isinstance(processor, tuple): self.processor = Combinator(processor) @@ -101,8 +100,10 @@ class LiveChatAsync: self._pauser = Queue() self._pauser.put_nowait(None) self._first_fetch = True - self._fetch_url = "live_chat/get_live_chat?continuation=" + self._fetch_url = config._sml self._topchat_only = topchat_only + self._dat = '' + self._last_offset_ms = 0 self._logger = logger self.exception = None LiveChatAsync._logger = logger @@ -160,10 +161,8 @@ class LiveChatAsync: async with httpx.AsyncClient(http2=True) as client: while(continuation and self._is_alive): continuation = await self._check_pause(continuation) - contents = await self._get_contents( - continuation, client, headers) + contents = await self._get_contents(continuation, client, headers) metadata, chatdata = self._parser.parse(contents) - timeout = metadata['timeoutMs'] / 1000 chat_component = { "video_id": self._video_id, @@ -183,16 +182,16 @@ class LiveChatAsync: diff_time = timeout - (time.time() - time_mark) await asyncio.sleep(diff_time) continuation = metadata.get('continuation') + self._last_offset_ms = metadata.get('last_offset_ms', 0) except exceptions.ChatParseException as e: self._logger.debug(f"[{self._video_id}]{str(e)}") raise except Exception: - self._logger.error(f"{traceback.format_exc(limit = -1)}") + self._logger.error(f"{traceback.format_exc(limit=-1)}") raise self._logger.debug(f"[{self._video_id}] finished fetching chat.") - async def _check_pause(self, continuation): if self._pauser.empty(): '''pause''' @@ -215,46 +214,50 @@ class LiveChatAsync: ------- 'continuationContents' which includes metadata & chatdata. ''' - livechat_json = await self._get_livechat_json(continuation, client, headers) - contents = self._parser.get_contents(livechat_json) + livechat_json = await self._get_livechat_json(continuation, client, replay=self._is_replay, offset_ms=self._last_offset_ms) + contents, dat = self._parser.get_contents(livechat_json) + if self._dat == '' and dat: + self._dat = dat if self._first_fetch: if contents is None or self._is_replay: '''Try to fetch archive chat data.''' self._parser.is_replay = True - self._fetch_url = "live_chat_replay/get_live_chat_replay?continuation=" + self._fetch_url = config._smr continuation = arcparam.getparam( self._video_id, self.seektime, self._topchat_only) livechat_json = (await self._get_livechat_json( - continuation, client, headers)) + continuation, client, replay=True, offset_ms=self.seektime * 1000)) reload_continuation = self._parser.reload_continuation( - self._parser.get_contents(livechat_json)) + self._parser.get_contents(livechat_json)[0]) if reload_continuation: livechat_json = (await self._get_livechat_json( reload_continuation, client, headers)) - contents = self._parser.get_contents(livechat_json) + contents, _ = self._parser.get_contents(livechat_json) self._is_replay = True self._first_fetch = False return contents - async def _get_livechat_json(self, continuation, client, headers): + async def _get_livechat_json(self, continuation, client, replay: bool, offset_ms: int = 0): ''' Get json which includes chat data. ''' - continuation = urllib.parse.quote(continuation) + # continuation = urllib.parse.quote(continuation) livechat_json = None - url = f"https://www.youtube.com/{self._fetch_url}{continuation}&pbj=1" + if offset_ms < 0: + offset_ms = 0 + param = util.get_param(continuation, dat=self._dat, replay=replay, offsetms=offset_ms) for _ in range(MAX_RETRY + 1): try: - resp = await client.get(url, headers=headers) + resp = await client.post(self._fetch_url, json=param) livechat_json = resp.json() break except (json.JSONDecodeError, httpx.HTTPError): - await asyncio.sleep(1) + await asyncio.sleep(2) continue else: self._logger.error(f"[{self._video_id}]" f"Exceeded retry count.") - return None + raise exceptions.RetryExceedMaxCount() return livechat_json async def _callback_loop(self, callback): @@ -330,9 +333,6 @@ class LiveChatAsync: self.terminate() def _task_finished(self): - ''' - Terminate fetching chats. - ''' if self.is_alive(): self.terminate() try: diff --git a/pytchat/core_multithread/livechat.py b/pytchat/core_multithread/livechat.py index 3eb5184..883c2aa 100644 --- a/pytchat/core_multithread/livechat.py +++ b/pytchat/core_multithread/livechat.py @@ -3,7 +3,6 @@ import json import signal import time import traceback -import urllib.parse from concurrent.futures import CancelledError, ThreadPoolExecutor from queue import Queue from threading import Event @@ -11,10 +10,10 @@ from .buffer import Buffer from ..parser.live import Parser from .. import config from .. import exceptions +from .. import util from ..paramgen import liveparam, arcparam from ..processors.default.processor import DefaultProcessor from ..processors.combinator import Combinator -from ..util.extract_video_id import extract_video_id headers = config.headers MAX_RETRY = 10 @@ -84,7 +83,7 @@ class LiveChat: topchat_only=False, logger=config.logger(__name__) ): - self._video_id = extract_video_id(video_id) + self._video_id = util.extract_video_id(video_id) self.seektime = seektime if isinstance(processor, tuple): self.processor = Combinator(processor) @@ -101,8 +100,10 @@ class LiveChat: self._pauser = Queue() self._pauser.put_nowait(None) self._first_fetch = True - self._fetch_url = "live_chat/get_live_chat?continuation=" + self._fetch_url = config._sml self._topchat_only = topchat_only + self._dat = '' + self._last_offset_ms = 0 self._event = Event() self._logger = logger self.exception = None @@ -176,6 +177,7 @@ class LiveChat: diff_time = timeout - (time.time() - time_mark) self._event.wait(diff_time if diff_time > 0 else 0) continuation = metadata.get('continuation') + self._last_offset_ms = metadata.get('last_offset_ms', 0) except exceptions.ChatParseException as e: self._logger.debug(f"[{self._video_id}]{str(e)}") raise @@ -185,7 +187,6 @@ class LiveChat: self._logger.debug(f"[{self._video_id}] finished fetching chat.") - def _check_pause(self, continuation): if self._pauser.empty(): '''pause''' @@ -207,43 +208,46 @@ class LiveChat: ------- 'continuationContents' which includes metadata & chat data. ''' - livechat_json = ( - self._get_livechat_json(continuation, client, headers) - ) - contents = self._parser.get_contents(livechat_json) + livechat_json = self._get_livechat_json(continuation, client, headers) + contents, dat = self._parser.get_contents(livechat_json) + if self._dat == '' and dat: + self._dat = dat if self._first_fetch: if contents is None or self._is_replay: '''Try to fetch archive chat data.''' self._parser.is_replay = True - self._fetch_url = "live_chat_replay/get_live_chat_replay?continuation=" + self._fetch_url = config._smr continuation = arcparam.getparam( self._video_id, self.seektime, self._topchat_only) - livechat_json = (self._get_livechat_json(continuation, client, headers)) + livechat_json = (self._get_livechat_json( + continuation, client, replay=True, offset_ms=self.seektime * 1000)) reload_continuation = self._parser.reload_continuation( - self._parser.get_contents(livechat_json)) + self._parser.get_contents(livechat_json)[0]) if reload_continuation: livechat_json = (self._get_livechat_json( reload_continuation, client, headers)) - contents = self._parser.get_contents(livechat_json) + contents, _ = self._parser.get_contents(livechat_json) self._is_replay = True self._first_fetch = False return contents - def _get_livechat_json(self, continuation, client, headers): + def _get_livechat_json(self, continuation, client, replay: bool, offset_ms: int = 0): ''' Get json which includes chat data. ''' - continuation = urllib.parse.quote(continuation) + # continuation = urllib.parse.quote(continuation) livechat_json = None - url = f"https://www.youtube.com/{self._fetch_url}{continuation}&pbj=1" + if offset_ms < 0: + offset_ms = 0 + param = util.get_param(continuation, dat=self._dat, replay=replay, offsetms=offset_ms) for _ in range(MAX_RETRY + 1): - with client: - try: - livechat_json = client.get(url, headers=headers).json() - break - except (json.JSONDecodeError, httpx.HTTPError): - time.sleep(2) - continue + try: + resp = client.post(self._fetch_url, json=param) + livechat_json = resp.json() + break + except (json.JSONDecodeError, httpx.HTTPError): + time.sleep(2) + continue else: self._logger.error(f"[{self._video_id}]" f"Exceeded retry count.") @@ -312,9 +316,6 @@ class LiveChat: self._logger.debug(f'[{self._video_id}] cancelled:{sender}') def terminate(self): - ''' - Terminate fetching chats. - ''' if self._pauser.empty(): self._pauser.put_nowait(None) self._is_alive = False diff --git a/pytchat/tool/extract/extractor.py b/pytchat/tool/extract/extractor.py index f7d492b..f722132 100644 --- a/pytchat/tool/extract/extractor.py +++ b/pytchat/tool/extract/extractor.py @@ -4,7 +4,7 @@ from . import duplcheck from .. videoinfo import VideoInfo from ... import config from ... exceptions import InvalidVideoIdException -from ... util.extract_video_id import extract_video_id +from ... import util logger = config.logger(__name__) headers = config.headers @@ -16,7 +16,7 @@ class Extractor: raise ValueError('div must be positive integer.') elif div > 10: div = 10 - self.video_id = extract_video_id(video_id) + self.video_id = util.extract_video_id(video_id) self.div = div self.callback = callback self.processor = processor diff --git a/pytchat/tool/videoinfo.py b/pytchat/tool/videoinfo.py index 1744de7..9f8b972 100644 --- a/pytchat/tool/videoinfo.py +++ b/pytchat/tool/videoinfo.py @@ -4,7 +4,7 @@ import re import time from .. import config from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError -from ..util.extract_video_id import extract_video_id +from ..util import extract_video_id headers = config.headers