Fix process

This commit is contained in:
taizan-hokouto
2020-12-05 14:42:02 +09:00
parent bc3f16e86b
commit 02d48ceccc
8 changed files with 126 additions and 61 deletions

View File

@@ -1,9 +1,13 @@
import logging # noqa
from . import mylogger
from base64 import a85decode as dc
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63,gzip(gfe)',
}
_sml = dc(b"BQS?8F#ks-GB\\6`H#IhIF^eo7@rH3;H#IhIF^eor06T''Ch\\'(?XmbXF>%9<FC/iuG%G#jBOQ!ICLqcS5tQB2;gCZ)?UdXC;f$GR3)MM2<(0>O7mh!,G@+K5?SO9T@okV").decode()
_smr = dc(b"BQS?8F#ks-GB\\6`H#IhIF^eo7@rH3;H#IhIF^eor06T''Ch\\'(?XmbXF>%9<FC/iuG%G#jBOQ!iEb03+@<k(QAU-F)8U=fDGsP557S5F7CiNH7;)D3N77^*B6YU@\\?WfBr0emZX=#^").decode()
def logger(module_name: str, loglevel=None):
module_logger = mylogger.get_logger(module_name, loglevel=loglevel)

View File

@@ -28,11 +28,12 @@ class Parser:
def get_contents(self, jsn):
if jsn is None:
self.raise_exception(exceptions.IllegalFunctionCall('Called with none JSON object.'))
if jsn['response']['responseContext'].get('errors'):
if jsn.get("error") or jsn.get("responseContext", {}).get("errors"):
raise exceptions.ResponseContextError(
'The video_id would be wrong, or video is deleted or private.')
contents = jsn['response'].get('continuationContents')
return contents
contents = jsn.get('continuationContents')
visitor_data = jsn.get("responseContext", {}).get("visitorData")
return contents, visitor_data
def parse(self, contents):
"""
@@ -85,6 +86,7 @@ class Parser:
'''Broadcasting end or cannot fetch chat stream'''
self.raise_exception(exceptions.NoContents('Chat data stream is empty.'))
cont = contents['liveChatContinuation']['continuations'][0]
if cont.get("liveChatReplayContinuationData"):
# chat data exist.
return None
@@ -97,23 +99,22 @@ class Parser:
def _create_data(self, metadata, contents):
actions = contents['liveChatContinuation'].get('actions')
if self.is_replay:
interval = self._get_interval(actions)
metadata.setdefault("timeoutMs", interval)
last_offset_ms = self._get_lastoffset(actions)
metadata.setdefault("timeoutMs", 5000)
metadata.setdefault("last_offset_ms", last_offset_ms)
"""Archived chat has different structures than live chat,
so make it the same format."""
chatdata = [action["replayChatItemAction"]["actions"][0]
for action in actions]
else:
metadata.setdefault('timeoutMs', 10000)
metadata.setdefault('timeoutMs', 5000)
chatdata = actions
return metadata, chatdata
def _get_interval(self, actions: list):
if actions is None:
return 0
start = int(actions[0]["replayChatItemAction"]["videoOffsetTimeMsec"])
last = int(actions[-1]["replayChatItemAction"]["videoOffsetTimeMsec"])
return (last - start)
def _get_lastoffset(self, actions: list):
if actions:
return int(actions[-1]["replayChatItemAction"]["videoOffsetTimeMsec"])
return 0
def raise_exception(self, exception):
if self.exception_holder is None:

View File

@@ -1,5 +1,6 @@
import datetime
import pytz
from datetime import datetime, timedelta, timezone
TZ_UTC = timezone(timedelta(0), 'UTC')
class BaseRenderer:
@@ -62,13 +63,13 @@ class BaseRenderer:
if badges:
for badge in badges:
author_type = badge["liveChatAuthorBadgeRenderer"]["accessibility"]["accessibilityData"]["label"]
if author_type == '確認済み':
if author_type == 'VERIFIED' or author_type == '確認済み':
isVerified = True
if author_type == '所有者':
if author_type == 'OWNER' or author_type == '所有者':
isChatOwner = True
if 'メンバー' in author_type:
if 'メンバー' in author_type or 'MEMBER' in author_type:
isChatSponsor = True
if author_type == 'モデレーター':
if author_type == 'MODERATOR' or author_type == 'モデレーター':
isChatModerator = True
return isVerified, isChatOwner, isChatSponsor, isChatModerator
@@ -76,6 +77,6 @@ class BaseRenderer:
return self.renderer.get('id')
def get_publishedat(self, timestamp):
dt = datetime.datetime.fromtimestamp(int(timestamp) / 1000000)
return dt.astimezone(pytz.utc).isoformat(
dt = datetime.fromtimestamp(int(timestamp) / 1000000)
return dt.astimezone(TZ_UTC).isoformat(
timespec='milliseconds').replace('+00:00', 'Z')

View File

@@ -1,12 +1,12 @@
'''
YouTubeスーパーチャットで使用される通貨の記号とレート検索用の略号の
対応表
Table of symbols for the currencies used in YouTube Superchat.
Key
YouTubeスーパーチャットで使用される通貨の記号
(アルファベットで終わる場合、0xA0(&npsp)が付く)
Currency symbols used in YouTube Super Chat
If the symbol ends with a letter, it is followed by 0xA0 (&nbsp;).
Value:
fxtext: 3文字の通貨略称
jptest: 日本語テキスト
fxtext: ISO 4217 currency code
jptext: Japanese text
'''
symbols = {
"$": {"fxtext": "USD", "jptext": "米・ドル"},

View File

@@ -1,6 +1,8 @@
import asyncio
import httpx
import socket
from concurrent.futures import CancelledError
from json import JSONDecodeError
from . import parser
from . block import Block
from . worker import ExtractWorker
@@ -8,18 +10,17 @@ from . patch import Patch
from ... import config
from ... paramgen import arcparam
from ... exceptions import UnknownConnectionError
from concurrent.futures import CancelledError
from json import JSONDecodeError
from urllib.parse import quote
from ... util import get_param
headers = config.headers
REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
"get_live_chat_replay?continuation="
smr = config._smr
MAX_RETRY_COUNT = 3
# Set to avoid duplicate parameters
param_set = set()
aquired_params = set()
dat = ''
def _split(start, end, count, min_interval_sec=120):
@@ -55,28 +56,30 @@ def _split(start, end, count, min_interval_sec=120):
def ready_blocks(video_id, duration, div, callback):
param_set.clear()
aquired_params.clear()
if div <= 0:
raise ValueError
async def _get_blocks(video_id, duration, div, callback):
async with httpx.AsyncClient(http2=True) as session:
async with httpx.AsyncClient(http2=True, headers=headers) as session:
tasks = [_create_block(session, video_id, seektime, callback)
for seektime in _split(-1, duration, div)]
return await asyncio.gather(*tasks)
async def _create_block(session, video_id, seektime, callback):
continuation = arcparam.getparam(video_id, seektime=seektime)
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
err = None
last_offset = 0
global dat
for _ in range(MAX_RETRY_COUNT):
try:
if continuation in param_set:
if continuation in aquired_params:
next_continuation, actions = None, []
break
param_set.add(continuation)
resp = await session.get(url, headers=headers, timeout=10)
next_continuation, actions = parser.parse(resp.json())
aquired_params.add(continuation)
param = get_param(continuation, replay=True, offsetms=seektime * 1000, dat=dat)
resp = await session.post(smr, json=param, timeout=10)
next_continuation, actions, last_offset, dat = parser.parse(resp.json())
break
except JSONDecodeError:
await asyncio.sleep(3)
@@ -88,15 +91,14 @@ def ready_blocks(video_id, duration, div, callback):
raise UnknownConnectionError("Abort:" + str(err))
if actions:
first = parser.get_offset(actions[0])
last = parser.get_offset(actions[-1])
first_offset = parser.get_offset(actions[0])
if callback:
callback(actions, last - first)
callback(actions, last_offset - first_offset)
return Block(
continuation=next_continuation,
chat_data=actions,
first=first,
last=last
first=first_offset,
last=last_offset
)
"""
@@ -122,17 +124,19 @@ def fetch_patch(callback, blocks, video_id):
tasks = [worker.run(session) for worker in workers]
return await asyncio.gather(*tasks)
async def _fetch(continuation, session) -> Patch:
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
async def _fetch(continuation, last_offset, session=None) -> Patch:
global dat
err = None
for _ in range(MAX_RETRY_COUNT):
try:
if continuation in param_set:
if continuation in aquired_params:
continuation, actions = None, []
break
param_set.add(continuation)
resp = await session.get(url, headers=config.headers)
continuation, actions = parser.parse(resp.json())
aquired_params.add(continuation)
params = get_param(continuation, replay=True, offsetms=last_offset, dat=dat)
# util.save(json.dumps(params, ensure_ascii=False), "v:/~~/param_"+str(last_offset), ".json")
resp = await session.post(smr, json=params)
continuation, actions, last_offset, dat = parser.parse(resp.json())
break
except JSONDecodeError:
await asyncio.sleep(3)
@@ -147,7 +151,7 @@ def fetch_patch(callback, blocks, video_id):
raise UnknownConnectionError("Abort:" + str(err))
if actions:
last = parser.get_offset(actions[-1])
last = last_offset
first = parser.get_offset(actions[0])
if callback:
callback(actions, last - first)

View File

@@ -19,10 +19,10 @@ def parse(jsn):
"""
if jsn is None:
raise ValueError("parameter JSON is None")
if jsn['response']['responseContext'].get('errors'):
if jsn.get("error") or jsn.get("responseContext", {}).get("errors"):
raise exceptions.ResponseContextError(
'video_id is invalid or private/deleted.')
contents = jsn['response'].get('continuationContents')
contents = jsn.get('continuationContents')
if contents is None:
raise exceptions.NoContents('No chat data.')
@@ -31,13 +31,15 @@ def parse(jsn):
raise exceptions.NoContinuation('No Continuation')
metadata = cont.get('liveChatReplayContinuationData')
if metadata:
visitor_data = jsn.get("responseContext", {}).get("visitorData", '')
continuation = metadata.get("continuation")
actions = contents['liveChatContinuation'].get('actions')
return continuation, actions
return None, []
actions: list = contents['liveChatContinuation'].get('actions')
last_offset: int = get_offset(actions[-1]) if actions else 0
return continuation, actions, last_offset, visitor_data
return None, [], 0, ''
def get_offset(item):
def get_offset(item) -> int:
    """Extract the video offset time, in milliseconds, from a replay chat action."""
    replay_action = item['replayChatItemAction']
    return int(replay_action["videoOffsetTimeMsec"])

View File

@@ -38,7 +38,7 @@ class ExtractWorker:
async def run(self, session):
while self.block.continuation:
patch = await self.fetch(
self.block.continuation, session)
self.block.continuation, self.block.last, session)
if patch.continuation is None:
"""TODO : make the worker assigned to the last block
to work more than twice as possible.

View File

@@ -4,9 +4,18 @@ import json
import os
import re
from .. import config
from .. exceptions import InvalidVideoIdException
PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
PATTERN_YTURL = re.compile(r"((?<=(v|V)/)|(?<=be/)|(?<=(\?|\&)v=)|(?<=embed/))([\w-]+)")
YT_VIDEO_ID_LENGTH = 11
CLIENT_VERSION = ''.join(("2.", (datetime.datetime.today() - datetime.timedelta(days=1)).strftime("%Y%m%d"), ".01.00"))
UA = config.headers["user-agent"]
def extract(url):
_session = httpx.Client(http2=True)
@@ -17,8 +26,9 @@ def extract(url):
def save(data, filename, extention) -> str:
save_filename = filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention
with open(save_filename ,mode='w', encoding='utf-8') as f:
save_filename = filename + "_" + \
(datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention
with open(save_filename, mode='w', encoding='utf-8') as f:
f.writelines(data)
return save_filename
@@ -39,3 +49,46 @@ def checkpath(filepath):
body = f'{body}({str(counter)})'
newpath = os.path.join(os.path.dirname(filepath), body + extention)
return newpath
def get_param(continuation, replay=False, offsetms: int = 0, dat=''):
    """Build the JSON request body for the YouTube live-chat endpoint.

    Parameters
    ----------
    continuation : str
        Continuation token identifying the chat stream position.
    replay : bool
        When True, embed the player offset for archived-chat requests.
    offsetms : int
        Player offset in milliseconds; negative values are clamped to 0.
    dat : str
        visitorData value returned by a previous response ('' if none yet).
    """
    # Clamp instead of the original in-place reassignment — same result.
    clamped_ms = max(offsetms, 0)
    body = {
        "context": {
            "client": {
                "visitorData": dat,
                "userAgent": UA,
                "clientName": "WEB",
                "clientVersion": CLIENT_VERSION,
            },
        },
        "continuation": continuation,
    }
    if replay:
        # Plain assignment; the key cannot pre-exist in a freshly built dict.
        body["currentPlayerState"] = {"playerOffsetMs": str(int(clamped_ms))}
    return body
def extract_video_id(url_or_id: str) -> str:
    """Return the 11-character YouTube video id contained in *url_or_id*.

    Accepts either a bare video id or a YouTube URL (watch?v=, youtu.be/,
    embed/, /v/ forms). Surrounding square brackets are stripped first.

    Raises:
        TypeError: if *url_or_id* is not a str.
        InvalidVideoIdException: if no valid 11-character id can be found.
    """
    # Validate the type BEFORE any string operations: the original ran
    # `'[' in url_or_id` and `.replace(...)` first, so non-str input raised
    # an arbitrary TypeError instead of the intended message. Also use
    # isinstance rather than a `type(...) != str` comparison.
    if not isinstance(url_or_id, str):
        raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.")
    if '[' in url_or_id:
        url_or_id = url_or_id.replace('[', '').replace(']', '')
    if len(url_or_id) == YT_VIDEO_ID_LENGTH:
        return url_or_id
    match = re.search(PATTERN_YTURL, url_or_id)
    if match is None:
        raise InvalidVideoIdException(f"Invalid video id: {url_or_id}")
    try:
        ret = match.group(4)
    except IndexError:
        # Suppress the IndexError context; callers only need the domain error.
        raise InvalidVideoIdException(f"Invalid video id: {url_or_id}") from None
    if ret is None or len(ret) != YT_VIDEO_ID_LENGTH:
        raise InvalidVideoIdException(f"Invalid video id: {url_or_id}")
    return ret