Format code

taizan-hokuto
2020-06-04 23:10:26 +09:00
parent e6dbc8772e
commit 2474207691
50 changed files with 635 additions and 622 deletions

View File

@@ -5,7 +5,7 @@ from . import parser
 from . block import Block
 from . worker import ExtractWorker
 from . patch import Patch
-from ... import config
+from ... import config
 from ... paramgen import arcparam
 from ... exceptions import UnknownConnectionError
 from concurrent.futures import CancelledError
@@ -17,10 +17,11 @@ REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
     "get_live_chat_replay?continuation="
 MAX_RETRY_COUNT = 3
-def _split(start, end, count, min_interval_sec = 120):
+def _split(start, end, count, min_interval_sec=120):
     """
     Split section from `start` to `end` into `count` pieces,
-    and returns the beginning of each piece.
+    and returns the beginning of each piece.
     The `count` is adjusted so that the length of each piece
     is no smaller than `min_interval`.
@@ -28,41 +29,43 @@ def _split(start, end, count, min_interval_sec = 120):
     --------
     List of the offset of each block's first chat data.
     """
-    if not (isinstance(start,int) or isinstance(start,float)) or \
-        not (isinstance(end,int) or isinstance(end,float)):
+    if not (isinstance(start, int) or isinstance(start, float)) or \
+            not (isinstance(end, int) or isinstance(end, float)):
         raise ValueError("start/end must be int or float")
-    if not isinstance(count,int):
+    if not isinstance(count, int):
         raise ValueError("count must be int")
-    if start>end:
+    if start > end:
         raise ValueError("end must be equal to or greater than start.")
-    if count<1:
+    if count < 1:
         raise ValueError("count must be equal to or greater than 1.")
-    if (end-start)/count < min_interval_sec:
-        count = int((end-start)/min_interval_sec)
-        if count == 0 : count = 1
-    interval= (end-start)/count
+    if (end - start) / count < min_interval_sec:
+        count = int((end - start) / min_interval_sec)
+        if count == 0:
+            count = 1
+    interval = (end - start) / count
     if count == 1:
         return [start]
-    return sorted( list(set( [int(start + interval*j)
-        for j in range(count) ])))
+    return sorted(list(set([int(start + interval * j)
+                            for j in range(count)])))
 def ready_blocks(video_id, duration, div, callback):
-    if div <= 0: raise ValueError
+    if div <= 0:
+        raise ValueError
-    async def _get_blocks( video_id, duration, div, callback):
+    async def _get_blocks(video_id, duration, div, callback):
         async with aiohttp.ClientSession() as session:
-            tasks = [_create_block(session, video_id, seektime, callback)
-                for seektime in _split(-1, duration, div)]
+            tasks = [_create_block(session, video_id, seektime, callback)
+                     for seektime in _split(-1, duration, div)]
             return await asyncio.gather(*tasks)
     async def _create_block(session, video_id, seektime, callback):
-        continuation = arcparam.getparam(video_id, seektime = seektime)
+        continuation = arcparam.getparam(video_id, seektime=seektime)
         url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
         for _ in range(MAX_RETRY_COUNT):
-            try :
-                async with session.get(url, headers = headers) as resp:
+            try:
+                async with session.get(url, headers=headers) as resp:
                     text = await resp.text()
                 next_continuation, actions = parser.parse(json.loads(text))
                 break
@@ -76,41 +79,42 @@ def ready_blocks(video_id, duration, div, callback):
             first = parser.get_offset(actions[0])
             last = parser.get_offset(actions[-1])
             if callback:
-                callback(actions,last-first)
+                callback(actions, last - first)
             return Block(
-                continuation = next_continuation,
-                chat_data = actions,
-                first = first,
-                last = last
+                continuation=next_continuation,
+                chat_data=actions,
+                first=first,
+                last=last
             )
     """
    fetch initial blocks.
-    """
+    """
     loop = asyncio.get_event_loop()
     blocks = loop.run_until_complete(
         _get_blocks(video_id, duration, div, callback))
     return blocks
 def fetch_patch(callback, blocks, video_id):
     async def _allocate_workers():
         workers = [
             ExtractWorker(
-                fetch = _fetch, block = block,
-                blocks = blocks, video_id = video_id
+                fetch=_fetch, block=block,
+                blocks=blocks, video_id=video_id
             )
             for block in blocks
         ]
         async with aiohttp.ClientSession() as session:
             tasks = [worker.run(session) for worker in workers]
-            return await asyncio.gather(*tasks)
+            return await asyncio.gather(*tasks)
-    async def _fetch(continuation,session) -> Patch:
+    async def _fetch(continuation, session) -> Patch:
         url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
         for _ in range(MAX_RETRY_COUNT):
             try:
-                async with session.get(url,headers = config.headers) as resp:
+                async with session.get(url, headers=config.headers) as resp:
                     chat_json = await resp.text()
                 continuation, actions = parser.parse(json.loads(chat_json))
                 break
@@ -126,21 +130,22 @@ def fetch_patch(callback, blocks, video_id):
             if callback:
                 callback(actions, last - first)
             return Patch(actions, continuation, first, last)
-        return Patch(continuation = continuation)
+        return Patch(continuation=continuation)
     """
    allocate workers and assign blocks.
-    """
+    """
     loop = asyncio.get_event_loop()
     try:
         loop.run_until_complete(_allocate_workers())
     except CancelledError:
         pass
 async def _shutdown():
     print("\nshutdown...")
     tasks = [t for t in asyncio.all_tasks()
-        if t is not asyncio.current_task()]
+             if t is not asyncio.current_task()]
     for task in tasks:
         task.cancel()
     try:
@@ -148,7 +153,7 @@ async def _shutdown():
     except asyncio.CancelledError:
         pass
 def cancel():
     loop = asyncio.get_event_loop()
     loop.create_task(_shutdown())
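
Note: the `_split` helper reformatted above decides where each download block begins. A minimal standalone sketch of the same logic, reproduced here for illustration (not an import from the package), with a hypothetical one-hour replay:

# Standalone sketch of the _split logic shown in the diff above.
def _split(start, end, count, min_interval_sec=120):
    if not isinstance(start, (int, float)) or not isinstance(end, (int, float)):
        raise ValueError("start/end must be int or float")
    if not isinstance(count, int):
        raise ValueError("count must be int")
    if start > end:
        raise ValueError("end must be equal to or greater than start.")
    if count < 1:
        raise ValueError("count must be equal to or greater than 1.")
    # Shrink count so each piece spans at least min_interval_sec
    # (equivalent to the two-step adjustment in the diff).
    if (end - start) / count < min_interval_sec:
        count = max(int((end - start) / min_interval_sec), 1)
    interval = (end - start) / count
    if count == 1:
        return [start]
    return sorted(set(int(start + interval * j) for j in range(count)))

# A one-hour replay split 10 ways yields 10 block start offsets:
print(_split(-1, 3600, 10))   # [-1, 359, 719, 1079, ...]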

View File

@@ -1,14 +1,13 @@
 from . import parser
 class Block:
-    """Block object represents something like a box
+    """Block object represents something like a box
     to join chunk of chatdata.
     Parameter:
     ---------
     first : int :
-        videoOffsetTimeMs of the first chat_data
+        videoOffsetTimeMs of the first chat_data
         (chat_data[0])
     last : int :
         videoOffsetTimeMs of the last chat_data.
         (chat_data[-1])
@@ -23,15 +22,15 @@ class Block:
     continuation : str :
         continuation param of last chat data.
-    chat_data : list
+    chat_data : list
     done : bool :
         whether this block has been fetched.
     remaining : int :
         remaining data to extract.
         equals end - last.
     is_last : bool :
         whether this block is the last one in blocklist.
@@ -39,13 +38,13 @@ class Block:
         whether this block is in the process of during_split.
         while True, this block is excluded from duplicate split procedure.
     """
-    __slots__ = ['first','last','end','continuation','chat_data','remaining',
-        'done','is_last','during_split']
-    def __init__(self, first = 0, last = 0, end = 0,
-        continuation = '', chat_data = [], is_last = False,
-        during_split = False):
+    __slots__ = ['first', 'last', 'end', 'continuation', 'chat_data', 'remaining',
+                 'done', 'is_last', 'during_split']
+    def __init__(self, first=0, last=0, end=0,
+                 continuation='', chat_data=[], is_last=False,
+                 during_split=False):
         self.first = first
         self.last = last
         self.end = end
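
Note: a Block, per the docstring above, tracks one fetch window by millisecond offsets. A small sketch of the bookkeeping with hypothetical values:

# Hypothetical values illustrating the Block fields documented above.
b = Block(first=0, last=58000, end=60000, continuation='token', chat_data=[])
b.remaining = b.end - b.last   # 2000 ms of chat still to fetch in this block
b.done = b.remaining == 0      # False: a worker keeps fetching this block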

View File

@@ -1,7 +1,8 @@
 from . import parser
 def check_duplicate(chatdata):
-    max_range = len(chatdata)-1
+    max_range = len(chatdata) - 1
     tbl_offset = [None] * max_range
     tbl_id = [None] * max_range
     tbl_type = [None] * max_range
@@ -9,33 +10,31 @@ def check_duplicate(chatdata):
     def create_table(chatdata, max_range):
         for i in range(max_range):
             tbl_offset[i] = parser.get_offset(chatdata[i])
-            tbl_id[i] = parser.get_id(chatdata[i])
+            tbl_id[i] = parser.get_id(chatdata[i])
             tbl_type[i] = parser.get_type(chatdata[i])
     def is_duplicate(i, j):
-        return (
+        return (
             tbl_offset[i] == tbl_offset[j]
-            and
-            tbl_id[i] == tbl_id[j]
-            and
-            tbl_type[i] == tbl_type[j]
+            and tbl_id[i] == tbl_id[j]
+            and tbl_type[i] == tbl_type[j]
         )
     print("creating table...")
-    create_table(chatdata,max_range)
+    create_table(chatdata, max_range)
     print("searching duplicate data...")
-    return [{ "i":{
-        "index" : i, "id" : parser.get_id(chatdata[i]),
-        "offsetTime" : parser.get_offset(chatdata[i]),
-        "type" : parser.get_type(chatdata[i])
-        },
-        "j":{
-        "index" : j, "id" : parser.get_id(chatdata[j]),
-        "offsetTime" : parser.get_offset(chatdata[j]),
-        "type" : parser.get_type(chatdata[j])
-        }
-        }
-        for i in range(max_range) for j in range(i+1,max_range)
-        if is_duplicate(i,j)]
+    return [{"i": {
+        "index": i, "id": parser.get_id(chatdata[i]),
+        "offsetTime": parser.get_offset(chatdata[i]),
+        "type": parser.get_type(chatdata[i])
+    },
+        "j": {
+        "index": j, "id": parser.get_id(chatdata[j]),
+        "offsetTime": parser.get_offset(chatdata[j]),
+        "type": parser.get_type(chatdata[j])
+    }
+    }
+        for i in range(max_range) for j in range(i + 1, max_range)
+        if is_duplicate(i, j)]
 def check_duplicate_offset(chatdata):
@@ -47,27 +46,27 @@ def check_duplicate_offset(chatdata):
     def create_table(chatdata, max_range):
         for i in range(max_range):
             tbl_offset[i] = parser.get_offset(chatdata[i])
-            tbl_id[i] = parser.get_id(chatdata[i])
+            tbl_id[i] = parser.get_id(chatdata[i])
             tbl_type[i] = parser.get_type(chatdata[i])
     def is_duplicate(i, j):
-        return (
+        return (
             tbl_offset[i] == tbl_offset[j]
-            and
-            tbl_id[i] == tbl_id[j]
+            and tbl_id[i] == tbl_id[j]
         )
     print("creating table...")
-    create_table(chatdata,max_range)
+    create_table(chatdata, max_range)
     print("searching duplicate data...")
     return [{
-        "index" : i, "id" : tbl_id[i],
-        "offsetTime" : tbl_offset[i],
-        "type:" : tbl_type[i]
-        }
-        for i in range(max_range-1)
-        if is_duplicate(i,i+1)]
+        "index": i, "id": tbl_id[i],
+        "offsetTime": tbl_offset[i],
+        "type:": tbl_type[i]
+    }
+        for i in range(max_range - 1)
+        if is_duplicate(i, i + 1)]
 def remove_duplicate_head(blocks):
     if len(blocks) == 0 or len(blocks) == 1:
@@ -77,64 +76,62 @@ def remove_duplicate_head(blocks):
         if len(blocks[index].chat_data) == 0:
             return True
-        elif len(blocks[index+1].chat_data) == 0:
+        elif len(blocks[index + 1].chat_data) == 0:
             return False
         id_0 = parser.get_id(blocks[index].chat_data[0])
-        id_1 = parser.get_id(blocks[index+1].chat_data[0])
+        id_1 = parser.get_id(blocks[index + 1].chat_data[0])
         type_0 = parser.get_type(blocks[index].chat_data[0])
-        type_1 = parser.get_type(blocks[index+1].chat_data[0])
+        type_1 = parser.get_type(blocks[index + 1].chat_data[0])
         return (
-            blocks[index].first == blocks[index+1].first
-            and
-            id_0 == id_1
-            and
-            type_0 == type_1
+            blocks[index].first == blocks[index + 1].first
+            and id_0 == id_1
+            and type_0 == type_1
         )
-    ret = [blocks[i] for i in range(len(blocks)-1)
-        if (len(blocks[i].chat_data)>0 and
-            not is_duplicate_head(i) )]
+    ret = [blocks[i] for i in range(len(blocks) - 1)
+           if (len(blocks[i].chat_data) > 0
+               and not is_duplicate_head(i))]
     ret.append(blocks[-1])
     return ret
 def remove_duplicate_tail(blocks):
     if len(blocks) == 0 or len(blocks) == 1:
-        return blocks
+        return blocks
     def is_duplicate_tail(index):
         if len(blocks[index].chat_data) == 0:
             return True
-        elif len(blocks[index-1].chat_data) == 0:
+        elif len(blocks[index - 1].chat_data) == 0:
             return False
-        id_0 = parser.get_id(blocks[index-1].chat_data[-1])
+        id_0 = parser.get_id(blocks[index - 1].chat_data[-1])
         id_1 = parser.get_id(blocks[index].chat_data[-1])
-        type_0 = parser.get_type(blocks[index-1].chat_data[-1])
+        type_0 = parser.get_type(blocks[index - 1].chat_data[-1])
         type_1 = parser.get_type(blocks[index].chat_data[-1])
         return (
-            blocks[index-1].last == blocks[index].last
-            and
-            id_0 == id_1
-            and
-            type_0 == type_1
+            blocks[index - 1].last == blocks[index].last
+            and id_0 == id_1
+            and type_0 == type_1
         )
-    ret = [blocks[i] for i in range(0,len(blocks))
-        if i == 0 or not is_duplicate_tail(i) ]
+    ret = [blocks[i] for i in range(0, len(blocks))
+           if i == 0 or not is_duplicate_tail(i)]
     return ret
 def remove_overlap(blocks):
     """
     Fix overlapped blocks after ready_blocks().
-    Align the last offset of each block to the first offset
+    Align the last offset of each block to the first offset
     of next block (equals `end` offset of each block).
     """
     if len(blocks) == 0 or len(blocks) == 1:
-        return blocks
+        return blocks
     for block in blocks:
         if block.is_last:
             break
-        if len(block.chat_data)==0:
+        if len(block.chat_data) == 0:
             continue
         block_end = block.end
         if block.last >= block_end:
@@ -143,14 +140,14 @@ def remove_overlap(blocks):
                 break
             block.chat_data.pop()
             block.last = parser.get_offset(line)
-        block.remaining=0
-        block.done=True
+        block.remaining = 0
+        block.done = True
         block.continuation = None
     return blocks
 def _dump(blocks):
-    print(f"---------- first last end---")
-    for i,block in enumerate(blocks):
-        print(f"block[{i:3}] {block.first:>10} {block.last:>10} {block.end:>10}")
+    print("---------- first last end---")
+    for i, block in enumerate(blocks):
+        print(
+            f"block[{i:3}] {block.first:>10} {block.last:>10} {block.end:>10}")

View File

@@ -1,16 +1,16 @@
 from . import asyncdl
-from . import duplcheck
 from . import parser
+from . import duplcheck
 from .. videoinfo import VideoInfo
 from ... import config
 from ... exceptions import InvalidVideoIdException
 logger = config.logger(__name__)
-headers=config.headers
+headers = config.headers
 class Extractor:
-    def __init__(self, video_id, div = 1, callback = None, processor = None):
-        if not isinstance(div ,int) or div < 1:
+    def __init__(self, video_id, div=1, callback=None, processor=None):
+        if not isinstance(div, int) or div < 1:
             raise ValueError('div must be positive integer.')
         elif div > 10:
             div = 10
@@ -33,7 +33,7 @@ class Extractor:
         blocks = asyncdl.ready_blocks(
             self.video_id, self.duration, self.div, self.callback)
         self.blocks = [block for block in blocks if block]
-        return self
+        return self
     def _remove_duplicate_head(self):
         self.blocks = duplcheck.remove_duplicate_head(self.blocks)
@@ -41,10 +41,10 @@ class Extractor:
     def _set_block_end(self):
         if len(self.blocks) > 0:
-            for i in range(len(self.blocks)-1):
-                self.blocks[i].end = self.blocks[i+1].first
-            self.blocks[-1].end = self.duration*1000
-            self.blocks[-1].is_last =True
+            for i in range(len(self.blocks) - 1):
+                self.blocks[i].end = self.blocks[i + 1].first
+            self.blocks[-1].end = self.duration * 1000
+            self.blocks[-1].is_last = True
         return self
     def _remove_overlap(self):
@@ -62,7 +62,7 @@ class Extractor:
     def _combine(self):
         ret = []
         for block in self.blocks:
-            ret.extend(block.chat_data)
+            ret.extend(block.chat_data)
         return ret
     def _execute_extract_operations(self):
@@ -82,11 +82,12 @@ class Extractor:
             return []
         data = self._execute_extract_operations()
         if self.processor is None:
-            return data
+            return data
         return self.processor.process(
-            [{'video_id':None,'timeout':1,'chatdata' : (action
-            ["replayChatItemAction"]["actions"][0] for action in data)}]
-        )
+            [{'video_id': None,
+              'timeout': 1,
+              'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
+        )
     def cancel(self):
-        asyncdl.cancel()
+        asyncdl.cancel()
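
Note: the final `processor.process` call above wraps the combined chat data in the shape the processors expect; `chatdata` is a generator, so it can be consumed only once. With a hypothetical record, the payload looks like this:

# One replayChatItemAction wraps the action a processor actually consumes.
data = [{"replayChatItemAction": {"actions": [{"addChatItemAction": {"item": {}}}]}}]
payload = [{'video_id': None,
            'timeout': 1,
            'chatdata': (action["replayChatItemAction"]["actions"][0]
                         for action in data)}]
next(payload[0]['chatdata'])   # -> {'addChatItemAction': {'item': {}}}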

View File

@@ -1,12 +1,12 @@
 import json
 from ... import config
-from ... exceptions import (
-    ResponseContextError,
-    NoContentsException,
-    NoContinuationsException )
+from ... exceptions import (
+    ResponseContextError,
+    NoContentsException,
+    NoContinuationsException)
 logger = config.logger(__name__)
 def parse(jsn):
     """
     Parse replay chat data.
@@ -20,12 +20,12 @@ def parse(jsn):
     actions : list
     """
-    if jsn is None:
+    if jsn is None:
         raise ValueError("parameter JSON is None")
     if jsn['response']['responseContext'].get('errors'):
         raise ResponseContextError(
-            'video_id is invalid or private/deleted.')
-    contents=jsn['response'].get('continuationContents')
+            'video_id is invalid or private/deleted.')
+    contents = jsn['response'].get('continuationContents')
     if contents is None:
         raise NoContentsException('No chat data.')
@@ -43,12 +43,12 @@ def parse(jsn):
 def get_offset(item):
     return int(item['replayChatItemAction']["videoOffsetTimeMsec"])
 def get_id(item):
     return list((list(item['replayChatItemAction']["actions"][0].values()
-        )[0])['item'].values())[0].get('id')
+                 )[0])['item'].values())[0].get('id')
 def get_type(item):
     return list((list(item['replayChatItemAction']["actions"][0].values()
-        )[0])['item'].keys())[0]
+                 )[0])['item'].keys())[0]

View File

@@ -2,17 +2,19 @@ from . import parser
 from . block import Block
 from typing import NamedTuple
 class Patch(NamedTuple):
     """
     Patch represents chunk of chat data
     which is fetched by asyncdl.fetch_patch._fetch().
     """
-    chats : list = []
-    continuation : str = None
-    first : int = None
-    last : int = None
+    chats: list = []
+    continuation: str = None
+    first: int = None
+    last: int = None
-def fill(block:Block, patch:Patch):
+def fill(block: Block, patch: Patch):
     block_end = block.end
     if patch.last < block_end or block.is_last:
         set_patch(block, patch)
@@ -23,32 +25,31 @@ def fill(block:Block, patch:Patch):
             break
         patch.chats.pop()
     set_patch(block, patch._replace(
-        continuation = None,
-        last = line_offset
-        )
+        continuation=None,
+        last=line_offset
+    )
     )
-    block.remaining=0
-    block.done=True
+    block.remaining = 0
+    block.done = True
-def split(parent_block:Block, child_block:Block, patch:Patch):
+def split(parent_block: Block, child_block: Block, patch: Patch):
     parent_block.during_split = False
     if patch.first <= parent_block.last:
         ''' When patch overlaps with parent_block,
            discard this block. '''
         child_block.continuation = None
-        ''' Leave child_block.during_split == True
+        ''' Leave child_block.during_split == True
            to exclude from during_split sequence. '''
-        return
+        return
     child_block.during_split = False
     child_block.first = patch.first
     parent_block.end = patch.first
     fill(child_block, patch)
-def set_patch(block:Block, patch:Patch):
+def set_patch(block: Block, patch: Patch):
     block.continuation = patch.continuation
     block.chat_data.extend(patch.chats)
     block.last = patch.last
-    block.remaining = block.end-block.last
+    block.remaining = block.end - block.last
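
Note: `set_patch` above is the bookkeeping step shared by `fill` and `split`: appending a patch advances `block.last` and shrinks `block.remaining`. A runnable sketch with hypothetical numbers, using a plain namespace in place of Block:

from types import SimpleNamespace
from typing import NamedTuple

class Patch(NamedTuple):          # same shape as the NamedTuple above
    chats: list = []
    continuation: str = None
    first: int = None
    last: int = None

block = SimpleNamespace(end=60000, chat_data=[], continuation=None,
                        last=0, remaining=60000)
patch = Patch(chats=["chat_a", "chat_b"], continuation="token",
              first=0, last=58000)

# the body of set_patch, applied directly:
block.continuation = patch.continuation
block.chat_data.extend(patch.chats)
block.last = patch.last
block.remaining = block.end - block.last   # 60000 - 58000 = 2000 ms left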

View File

@@ -1,8 +1,8 @@
 from . import parser
 from . block import Block
-from . patch import Patch, fill, split
+from . patch import fill, split
 from ... paramgen import arcparam
 class ExtractWorker:
     """
     ExtractWorker associates a download session with a block.
@@ -17,18 +17,18 @@ class ExtractWorker:
     block : Block :
         Block object that includes chat_data
     blocks : list :
         List of Block(s)
     video_id : str :
     parent_block : Block :
-        the block from which current block is splitted
+        the block from which current block is splitted
     """
     __slots__ = ['block', 'fetch', 'blocks', 'video_id', 'parent_block']
-    def __init__(self, fetch, block, blocks, video_id ):
+    def __init__(self, fetch, block, blocks, video_id):
         self.block = block
         self.fetch = fetch
         self.blocks = blocks
@@ -47,33 +47,35 @@ class ExtractWorker:
             if self.parent_block:
                 split(self.parent_block, self.block, patch)
                 self.parent_block = None
-            else:
+            else:
                 fill(self.block, patch)
             if self.block.continuation is None:
                 """finished fetching this block """
                 self.block.done = True
                 self.block = _search_new_block(self)
 def _search_new_block(worker) -> Block:
     index, undone_block = _get_undone_block(worker.blocks)
     if undone_block is None:
-        return Block(continuation = None)
-    mean = (undone_block.last + undone_block.end)/2
-    continuation = arcparam.getparam(worker.video_id, seektime = mean/1000)
+        return Block(continuation=None)
+    mean = (undone_block.last + undone_block.end) / 2
+    continuation = arcparam.getparam(worker.video_id, seektime=mean / 1000)
     worker.parent_block = undone_block
     worker.parent_block.during_split = True
     new_block = Block(
-        end = undone_block.end,
-        chat_data = [],
-        continuation = continuation,
-        during_split = True,
-        is_last = worker.parent_block.is_last)
+        end=undone_block.end,
+        chat_data=[],
+        continuation=continuation,
+        during_split=True,
+        is_last=worker.parent_block.is_last)
     '''swap last block'''
     if worker.parent_block.is_last:
         worker.parent_block.is_last = False
-    worker.blocks.insert(index+1, new_block)
+    worker.blocks.insert(index + 1, new_block)
     return new_block
 def _get_undone_block(blocks) -> (int, Block):
     min_interval_ms = 120000
     max_remaining = 0
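
Note: `_search_new_block` above splits an unfinished block at the midpoint of its unfetched region; the seek time handed to `arcparam.getparam` is that midpoint converted from milliseconds to seconds. With hypothetical offsets:

# Hypothetical block: chat fetched up to 100000 ms, block ends at 300000 ms.
last, end = 100_000, 300_000
mean = (last + end) / 2      # 200000.0 ms, midpoint of the unfetched region
seektime = mean / 1000       # 200.0 s, passed as seektime= to getparam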

View File

@@ -1,15 +1,14 @@
-import json
+import json
 import re
 import requests
 from .. import config
 from .. import util
-from ..exceptions import InvalidVideoIdException
+from ..exceptions import InvalidVideoIdException
 headers = config.headers
 pattern = re.compile(r"yt\.setConfig\({'PLAYER_CONFIG': ({.*})}\);")
-item_channel_id =[
+item_channel_id = [
     "videoDetails",
     "embeddedPlayerOverlayVideoDetailsRenderer",
     "channelThumbnailEndpoint",
@@ -29,7 +28,7 @@ item_response = [
     "embedded_player_response"
 ]
-item_author_image =[
+item_author_image = [
     "videoDetails",
     "embeddedPlayerOverlayVideoDetailsRenderer",
     "channelThumbnail",
@@ -63,6 +62,7 @@ item_moving_thumbnail = [
     "url"
 ]
+
 class VideoInfo:
     '''
     VideoInfo object retrieves YouTube video information.
@@ -76,6 +76,7 @@ class VideoInfo:
     InvalidVideoIdException :
         Occurs when video_id does not exist on YouTube.
     '''
+
     def __init__(self, video_id):
         self.video_id = video_id
         text = self._get_page_text(video_id)
@@ -83,13 +84,13 @@ class VideoInfo:
     def _get_page_text(self, video_id):
         url = f"https://www.youtube.com/embed/{video_id}"
-        resp = requests.get(url, headers = headers)
+        resp = requests.get(url, headers=headers)
         resp.raise_for_status()
         return resp.text
     def _parse(self, text):
         result = re.search(pattern, text)
-        res= json.loads(result.group(1))
+        res = json.loads(result.group(1))
         response = self._get_item(res, item_response)
         if response is None:
             self._check_video_is_private(res.get("args"))
@@ -98,7 +99,7 @@ class VideoInfo:
             raise InvalidVideoIdException(
                 f"No renderer found in video_id: [{self.video_id}].")
-    def _check_video_is_private(self,args):
+    def _check_video_is_private(self, args):
         if args and args.get("video_id"):
             raise InvalidVideoIdException(
                 f"video_id [{self.video_id}] is private or deleted.")
@@ -130,8 +131,8 @@ class VideoInfo:
     def get_title(self):
         if self._renderer.get("title"):
-            return [''.join(run["text"])
-                for run in self._renderer["title"]["runs"]][0]
+            return [''.join(run["text"])
+                    for run in self._renderer["title"]["runs"]][0]
         return None
     def get_channel_id(self):
@@ -141,13 +142,13 @@ class VideoInfo:
         return None
     def get_author_image(self):
-        return self._get_item(self._renderer, item_author_image)
+        return self._get_item(self._renderer, item_author_image)
     def get_thumbnail(self):
         return self._get_item(self._renderer, item_thumbnail)
     def get_channel_name(self):
         return self._get_item(self._renderer, item_channel_name)
     def get_moving_thumbnail(self):
         return self._get_item(self._renderer, item_moving_thumbnail)
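
Note: the `item_*` key lists above describe paths into the embedded-player JSON, and the getters hand them to `_get_item`, which is not shown in this diff. A hedged sketch of that traversal, assuming the conventional walk-over-keys implementation:

# Assumed implementation (not from the diff): returns None as soon as a
# key along the path is missing, otherwise the value at the end of it.
def _get_item(data, items):
    for key in items:
        if not isinstance(data, dict) or key not in data:
            return None
        data = data[key]
    return data

# e.g. _get_item(renderer, item_author_image) would dig out the icon URL.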