Fix process

taizan-hokouto
2020-12-05 14:42:02 +09:00
parent bc3f16e86b
commit 02d48ceccc
8 changed files with 126 additions and 61 deletions

View File

@@ -1,9 +1,13 @@
 import logging # noqa
 from . import mylogger
+from base64 import a85decode as dc

 headers = {
-    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63,gzip(gfe)',
 }

+_sml = dc(b"BQS?8F#ks-GB\\6`H#IhIF^eo7@rH3;H#IhIF^eor06T''Ch\\'(?XmbXF>%9<FC/iuG%G#jBOQ!ICLqcS5tQB2;gCZ)?UdXC;f$GR3)MM2<(0>O7mh!,G@+K5?SO9T@okV").decode()
+_smr = dc(b"BQS?8F#ks-GB\\6`H#IhIF^eo7@rH3;H#IhIF^eor06T''Ch\\'(?XmbXF>%9<FC/iuG%G#jBOQ!iEb03+@<k(QAU-F)8U=fDGsP557S5F7CiNH7;)D3N77^*B6YU@\\?WfBr0emZX=#^").decode()
+
 def logger(module_name: str, loglevel=None):
     module_logger = mylogger.get_logger(module_name, loglevel=loglevel)
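The new _sml and _smr constants hold endpoint URLs stored Ascii85-encoded and decoded once at import time via base64.a85decode. A minimal sketch of the same round trip (the URL below is a placeholder, not the decoded value of the constants above):

    from base64 import a85encode, a85decode

    # Hypothetical endpoint standing in for whatever _smr actually decodes to.
    encoded = a85encode(b"https://example.com/youtubei/v1/live_chat/get_live_chat_replay")
    decoded = a85decode(encoded).decode()  # bytes back to the original str
    assert decoded.startswith("https://example.com/")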

View File

@@ -28,11 +28,12 @@ class Parser:
     def get_contents(self, jsn):
         if jsn is None:
             self.raise_exception(exceptions.IllegalFunctionCall('Called with none JSON object.'))
-        if jsn['response']['responseContext'].get('errors'):
+        if jsn.get("error") or jsn.get("responseContext", {}).get("errors"):
             raise exceptions.ResponseContextError(
                 'The video_id would be wrong, or video is deleted or private.')
-        contents = jsn['response'].get('continuationContents')
-        return contents
+        contents = jsn.get('continuationContents')
+        visitor_data = jsn.get("responseContext", {}).get("visitorData")
+        return contents, visitor_data

     def parse(self, contents):
         """
@@ -85,6 +86,7 @@ class Parser:
             '''Broadcasting end or cannot fetch chat stream'''
             self.raise_exception(exceptions.NoContents('Chat data stream is empty.'))
         cont = contents['liveChatContinuation']['continuations'][0]
         if cont.get("liveChatReplayContinuationData"):
             # chat data exist.
             return None
@@ -97,23 +99,22 @@ class Parser:
     def _create_data(self, metadata, contents):
         actions = contents['liveChatContinuation'].get('actions')
         if self.is_replay:
-            interval = self._get_interval(actions)
-            metadata.setdefault("timeoutMs", interval)
+            last_offset_ms = self._get_lastoffset(actions)
+            metadata.setdefault("timeoutMs", 5000)
+            metadata.setdefault("last_offset_ms", last_offset_ms)
             """Archived chat has different structures than live chat,
             so make it the same format."""
             chatdata = [action["replayChatItemAction"]["actions"][0]
                         for action in actions]
         else:
-            metadata.setdefault('timeoutMs', 10000)
+            metadata.setdefault('timeoutMs', 5000)
             chatdata = actions
         return metadata, chatdata

-    def _get_interval(self, actions: list):
-        if actions is None:
-            return 0
-        start = int(actions[0]["replayChatItemAction"]["videoOffsetTimeMsec"])
-        last = int(actions[-1]["replayChatItemAction"]["videoOffsetTimeMsec"])
-        return (last - start)
+    def _get_lastoffset(self, actions: list):
+        if actions:
+            return int(actions[-1]["replayChatItemAction"]["videoOffsetTimeMsec"])
+        return 0

     def raise_exception(self, exception):
         if self.exception_holder is None:
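_get_interval (the span between first and last action) gives way to _get_lastoffset (the absolute offset of the final action), which the new offset-based paging requires. A standalone sketch with a fabricated action list:

    def _get_lastoffset(actions: list) -> int:
        # Offset in ms into the video of the last replayed chat item; 0 if empty.
        if actions:
            return int(actions[-1]["replayChatItemAction"]["videoOffsetTimeMsec"])
        return 0

    actions = [{"replayChatItemAction": {"videoOffsetTimeMsec": "15000", "actions": [{}]}}]
    print(_get_lastoffset(actions))  # 15000
    print(_get_lastoffset([]))       # 0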

View File

@@ -1,5 +1,6 @@
-import datetime
-import pytz
+from datetime import datetime, timedelta, timezone
+
+TZ_UTC = timezone(timedelta(0), 'UTC')


 class BaseRenderer:
@@ -62,13 +63,13 @@ class BaseRenderer:
         if badges:
             for badge in badges:
                 author_type = badge["liveChatAuthorBadgeRenderer"]["accessibility"]["accessibilityData"]["label"]
-                if author_type == '確認済み':
+                if author_type == 'VERIFIED' or author_type == '確認済み':
                     isVerified = True
-                if author_type == '所有者':
+                if author_type == 'OWNER' or author_type == '所有者':
                     isChatOwner = True
-                if 'メンバー' in author_type:
+                if 'メンバー' in author_type or 'MEMBER' in author_type:
                     isChatSponsor = True
-                if author_type == 'モデレーター':
+                if author_type == 'MODERATOR' or author_type == 'モデレーター':
                     isChatModerator = True
         return isVerified, isChatOwner, isChatSponsor, isChatModerator
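The badge labels arrive localized in the accessibility data, so both the English and Japanese variants have to be matched. A condensed sketch of the same classification logic:

    def classify_badge(label: str):
        # Accessibility labels vary with the account locale.
        is_verified = label in ('VERIFIED', '確認済み')
        is_owner = label in ('OWNER', '所有者')
        is_member = 'MEMBER' in label or 'メンバー' in label
        is_moderator = label in ('MODERATOR', 'モデレーター')
        return is_verified, is_owner, is_member, is_moderator

    print(classify_badge('MODERATOR'))   # (False, False, False, True)
    print(classify_badge('メンバー（1年）'))  # (False, False, True, False)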
@@ -76,6 +77,6 @@ class BaseRenderer:
         return self.renderer.get('id')

     def get_publishedat(self, timestamp):
-        dt = datetime.datetime.fromtimestamp(int(timestamp) / 1000000)
-        return dt.astimezone(pytz.utc).isoformat(
+        dt = datetime.fromtimestamp(int(timestamp) / 1000000)
+        return dt.astimezone(TZ_UTC).isoformat(
             timespec='milliseconds').replace('+00:00', 'Z')
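Dropping the pytz dependency: timezone(timedelta(0), 'UTC') from the standard library behaves like pytz.utc for this conversion. A quick check with a made-up microsecond timestamp:

    from datetime import datetime, timedelta, timezone

    TZ_UTC = timezone(timedelta(0), 'UTC')

    # YouTube timestamps are microseconds since the epoch.
    dt = datetime.fromtimestamp(1600000000000000 / 1000000)
    print(dt.astimezone(TZ_UTC).isoformat(timespec='milliseconds').replace('+00:00', 'Z'))
    # 2020-09-13T12:26:40.000Z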

View File

@@ -1,12 +1,12 @@
''' '''
YouTubeスーパーチャットで使用される通貨の記号とレート検索用の略号の Table of symbols for the currencies used in YouTube Superchat.
対応表
Key Key
YouTubeスーパーチャットで使用される通貨の記号 Currency symbols used in YouTube Super Chat
(アルファベットで終わる場合、0xA0(&npsp)が付く) If it ends with an alphabet, it will be followed by 0xA0(&npsp).
Value: Value:
fxtext: 3文字の通貨略称 fxtext: ISO 4217 currency code
jptest: 日本語テキスト jptest: japanese text
''' '''
symbols = { symbols = {
"$": {"fxtext": "USD", "jptext": "米・ドル"}, "$": {"fxtext": "USD", "jptext": "米・ドル"},

View File

@@ -1,6 +1,8 @@
 import asyncio
 import httpx
 import socket
+from concurrent.futures import CancelledError
+from json import JSONDecodeError
 from . import parser
 from . block import Block
 from . worker import ExtractWorker
@@ -8,18 +10,17 @@ from . patch import Patch
 from ... import config
 from ... paramgen import arcparam
 from ... exceptions import UnknownConnectionError
-from concurrent.futures import CancelledError
-from json import JSONDecodeError
-from urllib.parse import quote
+from ... util import get_param

 headers = config.headers
-REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
-    "get_live_chat_replay?continuation="
+smr = config._smr
 MAX_RETRY_COUNT = 3

 # Set to avoid duplicate parameters
-param_set = set()
+aquired_params = set()
+dat = ''


 def _split(start, end, count, min_interval_sec=120):
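The querystring GET against REPLAY_URL is replaced by a JSON POST to the endpoint decoded into config._smr, with the body built by util.get_param. The call shape as a minimal sketch, with a placeholder URL and token (the real values come from config and the parser):

    import asyncio
    import httpx

    async def post_once():
        # Stand-ins for config._smr and the get_param() body.
        url = "https://example.com/youtubei/v1/live_chat/get_live_chat_replay"
        payload = {"context": {"client": {"clientName": "WEB"}}, "continuation": "op2w0w..."}
        async with httpx.AsyncClient(http2=True) as session:
            resp = await session.post(url, json=payload, timeout=10)
            return resp.json()

    # asyncio.run(post_once())  # performs a network call, so left commented out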
@@ -55,28 +56,30 @@ def _split(start, end, count, min_interval_sec=120):
 def ready_blocks(video_id, duration, div, callback):
-    param_set.clear()
+    aquired_params.clear()
     if div <= 0:
         raise ValueError

     async def _get_blocks(video_id, duration, div, callback):
-        async with httpx.AsyncClient(http2=True) as session:
+        async with httpx.AsyncClient(http2=True, headers=headers) as session:
             tasks = [_create_block(session, video_id, seektime, callback)
                      for seektime in _split(-1, duration, div)]
             return await asyncio.gather(*tasks)

     async def _create_block(session, video_id, seektime, callback):
         continuation = arcparam.getparam(video_id, seektime=seektime)
-        url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
         err = None
+        last_offset = 0
+        global dat
         for _ in range(MAX_RETRY_COUNT):
             try:
-                if continuation in param_set:
+                if continuation in aquired_params:
                     next_continuation, actions = None, []
                     break
-                param_set.add(continuation)
-                resp = await session.get(url, headers=headers, timeout=10)
-                next_continuation, actions = parser.parse(resp.json())
+                aquired_params.add(continuation)
+                param = get_param(continuation, replay=True, offsetms=seektime * 1000, dat=dat)
+                resp = await session.post(smr, json=param, timeout=10)
+                next_continuation, actions, last_offset, dat = parser.parse(resp.json())
                 break
             except JSONDecodeError:
                 await asyncio.sleep(3)
@@ -88,15 +91,14 @@ def ready_blocks(video_id, duration, div, callback):
                 raise UnknownConnectionError("Abort:" + str(err))

         if actions:
-            first = parser.get_offset(actions[0])
-            last = parser.get_offset(actions[-1])
+            first_offset = parser.get_offset(actions[0])
             if callback:
-                callback(actions, last - first)
+                callback(actions, last_offset - first_offset)
             return Block(
                 continuation=next_continuation,
                 chat_data=actions,
-                first=first,
-                last=last
+                first=first_offset,
+                last=last_offset
             )

     """
@@ -122,17 +124,19 @@ def fetch_patch(callback, blocks, video_id):
         tasks = [worker.run(session) for worker in workers]
         return await asyncio.gather(*tasks)

-    async def _fetch(continuation, session) -> Patch:
-        url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
+    async def _fetch(continuation, last_offset, session=None) -> Patch:
+        global dat
         err = None
         for _ in range(MAX_RETRY_COUNT):
             try:
-                if continuation in param_set:
+                if continuation in aquired_params:
                     continuation, actions = None, []
                     break
-                param_set.add(continuation)
-                resp = await session.get(url, headers=config.headers)
-                continuation, actions = parser.parse(resp.json())
+                aquired_params.add(continuation)
+                params = get_param(continuation, replay=True, offsetms=last_offset, dat=dat)
+                # util.save(json.dumps(params, ensure_ascii=False), "v:/~~/param_"+str(last_offset), ".json")
+                resp = await session.post(smr, json=params)
+                continuation, actions, last_offset, dat = parser.parse(resp.json())
                 break
             except JSONDecodeError:
                 await asyncio.sleep(3)
@@ -147,7 +151,7 @@ def fetch_patch(callback, blocks, video_id):
                 raise UnknownConnectionError("Abort:" + str(err))

         if actions:
-            last = parser.get_offset(actions[-1])
+            last = last_offset
             first = parser.get_offset(actions[0])
             if callback:
                 callback(actions, last - first)
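Both _create_block and _fetch now share the same shape: skip continuations already in aquired_params, POST the get_param body, and retry on malformed JSON. The retry skeleton, with a stubbed coroutine standing in for the real POST-and-parse round trip:

    import asyncio
    from json import JSONDecodeError

    MAX_RETRY_COUNT = 3
    aquired_params = set()

    async def fetch_with_retry(continuation, do_fetch):
        # do_fetch is a stub for session.post(smr, ...) followed by parser.parse().
        for _ in range(MAX_RETRY_COUNT):
            try:
                if continuation in aquired_params:
                    return None, []        # already fetched elsewhere: skip
                aquired_params.add(continuation)
                return await do_fetch(continuation)
            except JSONDecodeError:
                await asyncio.sleep(3)     # transient bad payload: back off, retry
        raise RuntimeError("retries exhausted")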

View File

@@ -19,10 +19,10 @@ def parse(jsn):
     """
     if jsn is None:
         raise ValueError("parameter JSON is None")
-    if jsn['response']['responseContext'].get('errors'):
+    if jsn.get("error") or jsn.get("responseContext", {}).get("errors"):
         raise exceptions.ResponseContextError(
             'video_id is invalid or private/deleted.')
-    contents = jsn['response'].get('continuationContents')
+    contents = jsn.get('continuationContents')
     if contents is None:
         raise exceptions.NoContents('No chat data.')
@@ -31,13 +31,15 @@ def parse(jsn):
         raise exceptions.NoContinuation('No Continuation')
     metadata = cont.get('liveChatReplayContinuationData')
     if metadata:
+        visitor_data = jsn.get("responseContext", {}).get("visitorData", '')
         continuation = metadata.get("continuation")
-        actions = contents['liveChatContinuation'].get('actions')
-        return continuation, actions
-    return None, []
+        actions: list = contents['liveChatContinuation'].get('actions')
+        last_offset: int = get_offset(actions[-1]) if actions else 0
+        return continuation, actions, last_offset, visitor_data
+    return None, [], 0, ''


-def get_offset(item):
+def get_offset(item) -> int:
     return int(item['replayChatItemAction']["videoOffsetTimeMsec"])
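parse() now hands back a 4-tuple: (continuation, actions, last_offset, visitor_data). A reduced model of the new contract run against a fabricated payload (token and visitorData values are stand-ins):

    def parse_lite(jsn):
        # Same return shape as the new parse(), minus the exception plumbing.
        live_chat = jsn['continuationContents']['liveChatContinuation']
        metadata = live_chat['continuations'][0].get('liveChatReplayContinuationData')
        if metadata:
            visitor_data = jsn.get("responseContext", {}).get("visitorData", '')
            actions = live_chat.get('actions')
            last_offset = int(actions[-1]['replayChatItemAction']["videoOffsetTimeMsec"]) if actions else 0
            return metadata.get("continuation"), actions, last_offset, visitor_data
        return None, [], 0, ''

    jsn = {
        "responseContext": {"visitorData": "CgtStubVisitor"},
        "continuationContents": {"liveChatContinuation": {
            "continuations": [{"liveChatReplayContinuationData": {"continuation": "op2w..."}}],
            "actions": [{"replayChatItemAction": {"videoOffsetTimeMsec": "90000"}}],
        }},
    }
    print(parse_lite(jsn)[2:])  # (90000, 'CgtStubVisitor')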

View File

@@ -38,7 +38,7 @@ class ExtractWorker:
     async def run(self, session):
         while self.block.continuation:
             patch = await self.fetch(
-                self.block.continuation, session)
+                self.block.continuation, self.block.last, session)
             if patch.continuation is None:
                 """TODO : make the worker assigned to the last block
                 to work more than twice as possible.

View File

@@ -4,9 +4,18 @@ import json
 import os
 import re
 from .. import config
+from .. exceptions import InvalidVideoIdException

 PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")

+PATTERN_YTURL = re.compile(r"((?<=(v|V)/)|(?<=be/)|(?<=(\?|\&)v=)|(?<=embed/))([\w-]+)")
+
+YT_VIDEO_ID_LENGTH = 11
+
+CLIENT_VERSION = ''.join(("2.", (datetime.datetime.today() - datetime.timedelta(days=1)).strftime("%Y%m%d"), ".01.00"))
+
+UA = config.headers["user-agent"]
+

 def extract(url):
     _session = httpx.Client(http2=True)
@@ -17,8 +26,9 @@ def extract(url):
 def save(data, filename, extention) -> str:
-    save_filename = filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention
-    with open(save_filename ,mode='w', encoding='utf-8') as f:
+    save_filename = filename + "_" + \
+        (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention
+    with open(save_filename, mode='w', encoding='utf-8') as f:
         f.writelines(data)
     return save_filename
@@ -39,3 +49,46 @@ def checkpath(filepath):
         body = f'{body}({str(counter)})'
         newpath = os.path.join(os.path.dirname(filepath), body + extention)
     return newpath
+
+
+def get_param(continuation, replay=False, offsetms: int = 0, dat=''):
+    if offsetms < 0:
+        offsetms = 0
+    ret = {
+        "context": {
+            "client": {
+                "visitorData": dat,
+                "userAgent": UA,
+                "clientName": "WEB",
+                "clientVersion": CLIENT_VERSION,
+            },
+        },
+        "continuation": continuation,
+    }
+    if replay:
+        ret.setdefault("currentPlayerState", {
+            "playerOffsetMs": str(int(offsetms))})
+    return ret
+
+
+def extract_video_id(url_or_id: str) -> str:
+    ret = ''
+    if '[' in url_or_id:
+        url_or_id = url_or_id.replace('[', '').replace(']', '')
+    if type(url_or_id) != str:
+        raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.")
+    if len(url_or_id) == YT_VIDEO_ID_LENGTH:
+        return url_or_id
+    match = re.search(PATTERN_YTURL, url_or_id)
+    if match is None:
+        raise InvalidVideoIdException(f"Invalid video id: {url_or_id}")
+    try:
+        ret = match.group(4)
+    except IndexError:
+        raise InvalidVideoIdException(f"Invalid video id: {url_or_id}")
+    if ret is None or len(ret) != YT_VIDEO_ID_LENGTH:
+        raise InvalidVideoIdException(f"Invalid video id: {url_or_id}")
+    return ret
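A quick demonstration of the two new helpers (the continuation and visitorData values are stand-ins; the video id is just a well-known 11-character example):

    params = get_param("op2w0w...", replay=True, offsetms=30000, dat="CgtStubVisitor")
    print(params["currentPlayerState"])               # {'playerOffsetMs': '30000'}
    print(params["context"]["client"]["clientName"])  # WEB

    print(extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ"))  # dQw4w9WgXcQ
    print(extract_video_id("https://youtu.be/dQw4w9WgXcQ"))                 # dQw4w9WgXcQ
    print(extract_video_id("dQw4w9WgXcQ"))                                  # returned as-is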