From 02d48cecccf251d8f961f0d6b3743fe774ccfcfb Mon Sep 17 00:00:00 2001 From: taizan-hokouto <55448286+taizan-hokuto@users.noreply.github.com> Date: Sat, 5 Dec 2020 14:42:02 +0900 Subject: [PATCH] Fix process --- pytchat/config/__init__.py | 6 +- pytchat/parser/live.py | 25 ++++---- .../processors/compatible/renderer/base.py | 17 +++--- .../compatible/renderer/currency.py | 12 ++-- pytchat/tool/extract/asyncdl.py | 54 ++++++++++-------- pytchat/tool/extract/parser.py | 14 +++-- pytchat/tool/extract/worker.py | 2 +- pytchat/util/__init__.py | 57 ++++++++++++++++++- 8 files changed, 126 insertions(+), 61 deletions(-) diff --git a/pytchat/config/__init__.py b/pytchat/config/__init__.py index e362819..4f26a1e 100644 --- a/pytchat/config/__init__.py +++ b/pytchat/config/__init__.py @@ -1,9 +1,13 @@ import logging # noqa from . import mylogger +from base64 import a85decode as dc headers = { - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63,gzip(gfe)', } +_sml = dc(b"BQS?8F#ks-GB\\6`H#IhIF^eo7@rH3;H#IhIF^eor06T''Ch\\'(?XmbXF>%9O7mh!,G@+K5?SO9T@okV").decode() +_smr = dc(b"BQS?8F#ks-GB\\6`H#IhIF^eo7@rH3;H#IhIF^eor06T''Ch\\'(?XmbXF>%9 Patch: - url = f"{REPLAY_URL}{quote(continuation)}&pbj=1" + async def _fetch(continuation, last_offset, session=None) -> Patch: + global dat err = None for _ in range(MAX_RETRY_COUNT): try: - if continuation in param_set: + if continuation in aquired_params: continuation, actions = None, [] break - param_set.add(continuation) - resp = await session.get(url, headers=config.headers) - continuation, actions = parser.parse(resp.json()) + aquired_params.add(continuation) + params = get_param(continuation, replay=True, offsetms=last_offset, dat=dat) + # util.save(json.dumps(params, ensure_ascii=False), "v:/~~/param_"+str(last_offset), ".json") + resp = await session.post(smr, json=params) + continuation, actions, last_offset, dat = parser.parse(resp.json()) break except JSONDecodeError: await asyncio.sleep(3) @@ -147,7 +151,7 @@ def fetch_patch(callback, blocks, video_id): raise UnknownConnectionError("Abort:" + str(err)) if actions: - last = parser.get_offset(actions[-1]) + last = last_offset first = parser.get_offset(actions[0]) if callback: callback(actions, last - first) diff --git a/pytchat/tool/extract/parser.py b/pytchat/tool/extract/parser.py index 2866af2..d9b2cc8 100644 --- a/pytchat/tool/extract/parser.py +++ b/pytchat/tool/extract/parser.py @@ -19,10 +19,10 @@ def parse(jsn): """ if jsn is None: raise ValueError("parameter JSON is None") - if jsn['response']['responseContext'].get('errors'): + if jsn.get("error") or jsn.get("responseContext", {}).get("errors"): raise exceptions.ResponseContextError( 'video_id is invalid or private/deleted.') - contents = jsn['response'].get('continuationContents') + contents = jsn.get('continuationContents') if contents is None: raise exceptions.NoContents('No chat data.') @@ -31,13 +31,15 @@ def parse(jsn): raise exceptions.NoContinuation('No Continuation') metadata = cont.get('liveChatReplayContinuationData') if metadata: + visitor_data = jsn.get("responseContext", {}).get("visitorData", '') continuation = metadata.get("continuation") - actions = contents['liveChatContinuation'].get('actions') - return continuation, actions - return None, [] + actions: list = contents['liveChatContinuation'].get('actions') + last_offset: int = get_offset(actions[-1]) if actions else 0 + return continuation, actions, last_offset, visitor_data + return None, [], 0, '' -def get_offset(item): +def get_offset(item) -> int: return int(item['replayChatItemAction']["videoOffsetTimeMsec"]) diff --git a/pytchat/tool/extract/worker.py b/pytchat/tool/extract/worker.py index 261de10..5216451 100644 --- a/pytchat/tool/extract/worker.py +++ b/pytchat/tool/extract/worker.py @@ -38,7 +38,7 @@ class ExtractWorker: async def run(self, session): while self.block.continuation: patch = await self.fetch( - self.block.continuation, session) + self.block.continuation, self.block.last, session) if patch.continuation is None: """TODO : make the worker assigned to the last block to work more than twice as possible. diff --git a/pytchat/util/__init__.py b/pytchat/util/__init__.py index acb984b..66cdd96 100644 --- a/pytchat/util/__init__.py +++ b/pytchat/util/__init__.py @@ -4,9 +4,18 @@ import json import os import re from .. import config +from .. exceptions import InvalidVideoIdException PATTERN = re.compile(r"(.*)\(([0-9]+)\)$") +PATTERN_YTURL = re.compile(r"((?<=(v|V)/)|(?<=be/)|(?<=(\?|\&)v=)|(?<=embed/))([\w-]+)") + +YT_VIDEO_ID_LENGTH = 11 + +CLIENT_VERSION = ''.join(("2.", (datetime.datetime.today() - datetime.timedelta(days=1)).strftime("%Y%m%d"), ".01.00")) + +UA = config.headers["user-agent"] + def extract(url): _session = httpx.Client(http2=True) @@ -17,8 +26,9 @@ def extract(url): def save(data, filename, extention) -> str: - save_filename = filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention - with open(save_filename ,mode='w', encoding='utf-8') as f: + save_filename = filename + "_" + \ + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention + with open(save_filename, mode='w', encoding='utf-8') as f: f.writelines(data) return save_filename @@ -39,3 +49,46 @@ def checkpath(filepath): body = f'{body}({str(counter)})' newpath = os.path.join(os.path.dirname(filepath), body + extention) return newpath + + +def get_param(continuation, replay=False, offsetms: int = 0, dat=''): + if offsetms < 0: + offsetms = 0 + ret = { + "context": { + "client": { + "visitorData": dat, + "userAgent": UA, + "clientName": "WEB", + "clientVersion": CLIENT_VERSION, + }, + + }, + "continuation": continuation, + } + if replay: + ret.setdefault("currentPlayerState", { + "playerOffsetMs": str(int(offsetms))}) + return ret + + +def extract_video_id(url_or_id: str) -> str: + ret = '' + if '[' in url_or_id: + url_or_id = url_or_id.replace('[', '').replace(']', '') + + if type(url_or_id) != str: + raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.") + if len(url_or_id) == YT_VIDEO_ID_LENGTH: + return url_or_id + match = re.search(PATTERN_YTURL, url_or_id) + if match is None: + raise InvalidVideoIdException(f"Invalid video id: {url_or_id}") + try: + ret = match.group(4) + except IndexError: + raise InvalidVideoIdException(f"Invalid video id: {url_or_id}") + + if ret is None or len(ret) != YT_VIDEO_ID_LENGTH: + raise InvalidVideoIdException(f"Invalid video id: {url_or_id}") + return ret