Fix process

This commit is contained in:
taizan-hokouto
2020-12-05 14:42:02 +09:00
parent bc3f16e86b
commit 02d48ceccc
8 changed files with 126 additions and 61 deletions

View File

@@ -1,9 +1,13 @@
import logging # noqa
from . import mylogger
from base64 import a85decode as dc
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 Edg/86.0.622.63,gzip(gfe)',
}
_sml = dc(b"BQS?8F#ks-GB\\6`H#IhIF^eo7@rH3;H#IhIF^eor06T''Ch\\'(?XmbXF>%9<FC/iuG%G#jBOQ!ICLqcS5tQB2;gCZ)?UdXC;f$GR3)MM2<(0>O7mh!,G@+K5?SO9T@okV").decode()
_smr = dc(b"BQS?8F#ks-GB\\6`H#IhIF^eo7@rH3;H#IhIF^eor06T''Ch\\'(?XmbXF>%9<FC/iuG%G#jBOQ!iEb03+@<k(QAU-F)8U=fDGsP557S5F7CiNH7;)D3N77^*B6YU@\\?WfBr0emZX=#^").decode()
def logger(module_name: str, loglevel=None):
module_logger = mylogger.get_logger(module_name, loglevel=loglevel)

View File

@@ -28,11 +28,12 @@ class Parser:
def get_contents(self, jsn):
if jsn is None:
self.raise_exception(exceptions.IllegalFunctionCall('Called with none JSON object.'))
if jsn['response']['responseContext'].get('errors'):
if jsn.get("error") or jsn.get("responseContext", {}).get("errors"):
raise exceptions.ResponseContextError(
'The video_id would be wrong, or video is deleted or private.')
contents = jsn['response'].get('continuationContents')
return contents
contents = jsn.get('continuationContents')
visitor_data = jsn.get("responseContext", {}).get("visitorData")
return contents, visitor_data
def parse(self, contents):
"""
@@ -85,6 +86,7 @@ class Parser:
'''Broadcasting end or cannot fetch chat stream'''
self.raise_exception(exceptions.NoContents('Chat data stream is empty.'))
cont = contents['liveChatContinuation']['continuations'][0]
if cont.get("liveChatReplayContinuationData"):
# chat data exist.
return None
@@ -97,23 +99,22 @@ class Parser:
def _create_data(self, metadata, contents):
actions = contents['liveChatContinuation'].get('actions')
if self.is_replay:
interval = self._get_interval(actions)
metadata.setdefault("timeoutMs", interval)
last_offset_ms = self._get_lastoffset(actions)
metadata.setdefault("timeoutMs", 5000)
metadata.setdefault("last_offset_ms", last_offset_ms)
"""Archived chat has different structures than live chat,
so make it the same format."""
chatdata = [action["replayChatItemAction"]["actions"][0]
for action in actions]
else:
metadata.setdefault('timeoutMs', 10000)
metadata.setdefault('timeoutMs', 5000)
chatdata = actions
return metadata, chatdata
def _get_interval(self, actions: list):
if actions is None:
return 0
start = int(actions[0]["replayChatItemAction"]["videoOffsetTimeMsec"])
last = int(actions[-1]["replayChatItemAction"]["videoOffsetTimeMsec"])
return (last - start)
def _get_lastoffset(self, actions: list):
if actions:
return int(actions[-1]["replayChatItemAction"]["videoOffsetTimeMsec"])
return 0
def raise_exception(self, exception):
if self.exception_holder is None:

View File

@@ -1,5 +1,6 @@
import datetime
import pytz
from datetime import datetime, timedelta, timezone
TZ_UTC = timezone(timedelta(0), 'UTC')
class BaseRenderer:
@@ -62,13 +63,13 @@ class BaseRenderer:
if badges:
for badge in badges:
author_type = badge["liveChatAuthorBadgeRenderer"]["accessibility"]["accessibilityData"]["label"]
if author_type == '確認済み':
if author_type == 'VERIFIED' or author_type == '確認済み':
isVerified = True
if author_type == '所有者':
if author_type == 'OWNER' or author_type == '所有者':
isChatOwner = True
if 'メンバー' in author_type:
if 'メンバー' in author_type or 'MEMBER' in author_type:
isChatSponsor = True
if author_type == 'モデレーター':
if author_type == 'MODERATOR' or author_type == 'モデレーター':
isChatModerator = True
return isVerified, isChatOwner, isChatSponsor, isChatModerator
@@ -76,6 +77,6 @@ class BaseRenderer:
return self.renderer.get('id')
def get_publishedat(self, timestamp):
dt = datetime.datetime.fromtimestamp(int(timestamp) / 1000000)
return dt.astimezone(pytz.utc).isoformat(
dt = datetime.fromtimestamp(int(timestamp) / 1000000)
return dt.astimezone(TZ_UTC).isoformat(
timespec='milliseconds').replace('+00:00', 'Z')

View File

@@ -1,12 +1,12 @@
'''
YouTubeスーパーチャットで使用される通貨の記号とレート検索用の略号の
対応表
Table of symbols for the currencies used in YouTube Superchat.
Key
YouTubeスーパーチャットで使用される通貨の記号
(アルファベットで終わる場合、0xA0(&npsp)が付く)
Currency symbols used in YouTube Super Chat
If the symbol ends with a letter, it is followed by 0xA0 (&nbsp;).
Value:
fxtext: 3文字の通貨略称
jptest: 日本語テキスト
fxtext: ISO 4217 currency code
jptext: Japanese text
'''
symbols = {
"$": {"fxtext": "USD", "jptext": "米・ドル"},

View File

@@ -1,6 +1,8 @@
import asyncio
import httpx
import socket
from concurrent.futures import CancelledError
from json import JSONDecodeError
from . import parser
from . block import Block
from . worker import ExtractWorker
@@ -8,18 +10,17 @@ from . patch import Patch
from ... import config
from ... paramgen import arcparam
from ... exceptions import UnknownConnectionError
from concurrent.futures import CancelledError
from json import JSONDecodeError
from urllib.parse import quote
from ... util import get_param
headers = config.headers
REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
"get_live_chat_replay?continuation="
smr = config._smr
MAX_RETRY_COUNT = 3
# Set to avoid duplicate parameters
param_set = set()
aquired_params = set()
dat = ''
def _split(start, end, count, min_interval_sec=120):
@@ -55,28 +56,30 @@ def _split(start, end, count, min_interval_sec=120):
def ready_blocks(video_id, duration, div, callback):
param_set.clear()
aquired_params.clear()
if div <= 0:
raise ValueError
async def _get_blocks(video_id, duration, div, callback):
async with httpx.AsyncClient(http2=True) as session:
async with httpx.AsyncClient(http2=True, headers=headers) as session:
tasks = [_create_block(session, video_id, seektime, callback)
for seektime in _split(-1, duration, div)]
return await asyncio.gather(*tasks)
async def _create_block(session, video_id, seektime, callback):
continuation = arcparam.getparam(video_id, seektime=seektime)
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
err = None
last_offset = 0
global dat
for _ in range(MAX_RETRY_COUNT):
try:
if continuation in param_set:
if continuation in aquired_params:
next_continuation, actions = None, []
break
param_set.add(continuation)
resp = await session.get(url, headers=headers, timeout=10)
next_continuation, actions = parser.parse(resp.json())
aquired_params.add(continuation)
param = get_param(continuation, replay=True, offsetms=seektime * 1000, dat=dat)
resp = await session.post(smr, json=param, timeout=10)
next_continuation, actions, last_offset, dat = parser.parse(resp.json())
break
except JSONDecodeError:
await asyncio.sleep(3)
@@ -88,15 +91,14 @@ def ready_blocks(video_id, duration, div, callback):
raise UnknownConnectionError("Abort:" + str(err))
if actions:
first = parser.get_offset(actions[0])
last = parser.get_offset(actions[-1])
first_offset = parser.get_offset(actions[0])
if callback:
callback(actions, last - first)
callback(actions, last_offset - first_offset)
return Block(
continuation=next_continuation,
chat_data=actions,
first=first,
last=last
first=first_offset,
last=last_offset
)
"""
@@ -122,17 +124,19 @@ def fetch_patch(callback, blocks, video_id):
tasks = [worker.run(session) for worker in workers]
return await asyncio.gather(*tasks)
async def _fetch(continuation, session) -> Patch:
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
async def _fetch(continuation, last_offset, session=None) -> Patch:
global dat
err = None
for _ in range(MAX_RETRY_COUNT):
try:
if continuation in param_set:
if continuation in aquired_params:
continuation, actions = None, []
break
param_set.add(continuation)
resp = await session.get(url, headers=config.headers)
continuation, actions = parser.parse(resp.json())
aquired_params.add(continuation)
params = get_param(continuation, replay=True, offsetms=last_offset, dat=dat)
# util.save(json.dumps(params, ensure_ascii=False), "v:/~~/param_"+str(last_offset), ".json")
resp = await session.post(smr, json=params)
continuation, actions, last_offset, dat = parser.parse(resp.json())
break
except JSONDecodeError:
await asyncio.sleep(3)
@@ -147,7 +151,7 @@ def fetch_patch(callback, blocks, video_id):
raise UnknownConnectionError("Abort:" + str(err))
if actions:
last = parser.get_offset(actions[-1])
last = last_offset
first = parser.get_offset(actions[0])
if callback:
callback(actions, last - first)

View File

@@ -19,10 +19,10 @@ def parse(jsn):
"""
if jsn is None:
raise ValueError("parameter JSON is None")
if jsn['response']['responseContext'].get('errors'):
if jsn.get("error") or jsn.get("responseContext", {}).get("errors"):
raise exceptions.ResponseContextError(
'video_id is invalid or private/deleted.')
contents = jsn['response'].get('continuationContents')
contents = jsn.get('continuationContents')
if contents is None:
raise exceptions.NoContents('No chat data.')
@@ -31,13 +31,15 @@ def parse(jsn):
raise exceptions.NoContinuation('No Continuation')
metadata = cont.get('liveChatReplayContinuationData')
if metadata:
visitor_data = jsn.get("responseContext", {}).get("visitorData", '')
continuation = metadata.get("continuation")
actions = contents['liveChatContinuation'].get('actions')
return continuation, actions
return None, []
actions: list = contents['liveChatContinuation'].get('actions')
last_offset: int = get_offset(actions[-1]) if actions else 0
return continuation, actions, last_offset, visitor_data
return None, [], 0, ''
def get_offset(item):
def get_offset(item) -> int:
    """Extract the video offset time, in milliseconds, from a replay chat action."""
    replay_action = item['replayChatItemAction']
    return int(replay_action["videoOffsetTimeMsec"])

View File

@@ -38,7 +38,7 @@ class ExtractWorker:
async def run(self, session):
while self.block.continuation:
patch = await self.fetch(
self.block.continuation, session)
self.block.continuation, self.block.last, session)
if patch.continuation is None:
"""TODO : make the worker assigned to the last block
to work more than twice as possible.

View File

@@ -4,9 +4,18 @@ import json
import os
import re
from .. import config
from .. exceptions import InvalidVideoIdException
PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
PATTERN_YTURL = re.compile(r"((?<=(v|V)/)|(?<=be/)|(?<=(\?|\&)v=)|(?<=embed/))([\w-]+)")
YT_VIDEO_ID_LENGTH = 11
CLIENT_VERSION = ''.join(("2.", (datetime.datetime.today() - datetime.timedelta(days=1)).strftime("%Y%m%d"), ".01.00"))
UA = config.headers["user-agent"]
def extract(url):
_session = httpx.Client(http2=True)
@@ -17,8 +26,9 @@ def extract(url):
def save(data, filename, extention) -> str:
save_filename = filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention
with open(save_filename ,mode='w', encoding='utf-8') as f:
save_filename = filename + "_" + \
(datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention
with open(save_filename, mode='w', encoding='utf-8') as f:
f.writelines(data)
return save_filename
@@ -39,3 +49,46 @@ def checkpath(filepath):
body = f'{body}({str(counter)})'
newpath = os.path.join(os.path.dirname(filepath), body + extention)
return newpath
def get_param(continuation, replay=False, offsetms: int = 0, dat=''):
    """Build the JSON request body for the YouTube live-chat endpoint.

    Parameters
    ----------
    continuation : str
        Continuation token identifying the chat stream position.
    replay : bool
        When True, embed the player offset for archived-chat requests.
    offsetms : int
        Player offset in milliseconds; negative values are clamped to 0.
    dat : str
        visitorData value returned by a previous response ('' if none yet).
    """
    # Clamp instead of the original in-place reassignment — same result.
    clamped_ms = max(offsetms, 0)
    body = {
        "context": {
            "client": {
                "visitorData": dat,
                "userAgent": UA,
                "clientName": "WEB",
                "clientVersion": CLIENT_VERSION,
            },
        },
        "continuation": continuation,
    }
    if replay:
        # Plain assignment; the key cannot pre-exist in a freshly built dict.
        body["currentPlayerState"] = {"playerOffsetMs": str(int(clamped_ms))}
    return body
def extract_video_id(url_or_id: str) -> str:
    """Return the 11-character YouTube video id contained in *url_or_id*.

    Accepts either a bare video id or a YouTube URL (watch?v=, youtu.be/,
    embed/, /v/ forms). Surrounding square brackets are stripped first.

    Raises:
        TypeError: if *url_or_id* is not a str.
        InvalidVideoIdException: if no valid 11-character id can be found.
    """
    # Validate the type BEFORE any string operations: the original ran
    # `'[' in url_or_id` and `.replace(...)` first, so non-str input raised
    # an arbitrary TypeError instead of the intended message. Also use
    # isinstance rather than a `type(...) != str` comparison.
    if not isinstance(url_or_id, str):
        raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.")
    if '[' in url_or_id:
        url_or_id = url_or_id.replace('[', '').replace(']', '')
    if len(url_or_id) == YT_VIDEO_ID_LENGTH:
        return url_or_id
    match = re.search(PATTERN_YTURL, url_or_id)
    if match is None:
        raise InvalidVideoIdException(f"Invalid video id: {url_or_id}")
    try:
        ret = match.group(4)
    except IndexError:
        # Suppress the IndexError context; callers only need the domain error.
        raise InvalidVideoIdException(f"Invalid video id: {url_or_id}") from None
    if ret is None or len(ret) != YT_VIDEO_ID_LENGTH:
        raise InvalidVideoIdException(f"Invalid video id: {url_or_id}")
    return ret