Aggregate return values with patch class

taizan-hokuto
2020-02-16 20:43:12 +09:00
parent 6fdb3bf8cf
commit c4cf424702
23 changed files with 35254 additions and 327 deletions

View File

@@ -1,153 +0,0 @@
from . import parser
from .. paramgen import arcparam
from . block import Block
class DownloadWorker:
"""
DownloadWorker associates a download session with a block.
Parameters
----------
fetch : func :
download function of asyncdl
block : Block :
Block object associated with this worker
blocks : list :
List of Block(s)
video_id : str :
YouTube video ID of the archived stream.
source_block : Block :
the Block from which the currently downloading block was split
"""
__slots__ = ['fetch', 'block', 'blocks', 'video_id', 'source_block']
def __init__(self, fetch, block, blocks, video_id ):
self.block = block
self.fetch = fetch
self.blocks = blocks
self.video_id = video_id
self.source_block = None
async def run(self, session):
"""Remove extra chats just after ready_blocks(). """
continuation = initial_fill(self.block)
"""download loop """
while continuation:
chats, new_cont, fetched_first, fetched_last = await self.fetch(
continuation, session)
if fetched_first is None:
break
if self.source_block:
continuation = split_fill(
self.source_block, self.block, chats, new_cont,
fetched_first, fetched_last)
self.source_block = None
else:
continuation = fill(self.block, chats, new_cont, fetched_last)
if continuation is None:
new_block = get_new_block(self)
self.block = new_block
continuation = new_block.continuation
def get_new_block(worker) -> Block:
worker.block.done = True
index,undone_block = search_undone_block(worker.blocks)
if undone_block is None:
return Block(continuation = None)
mean = (undone_block.end + undone_block.last)/2
continuation = arcparam.getparam(worker.video_id, seektime = mean/1000)
worker.source_block = undone_block
worker.source_block.splitting = True
new_block = Block(
end = undone_block.end,
chat_data = [],
continuation = continuation,
splitting = True,
is_last = worker.source_block.is_last)
worker.blocks.insert(index+1,new_block)
return new_block
def search_undone_block(blocks) -> (int, Block):
"""
Returns
--------
ret_index : int :
index of the undone Block within blocks.
ret_block : Block :
the undone Block whose download has not completed.
"""
max_remaining = 0
ret_block = None
ret_index = 0
for index, block in enumerate(blocks):
if block.done or block.splitting:
continue
remaining = block.remaining
if remaining > max_remaining and remaining > 120000:
ret_index = index
ret_block = block
max_remaining = remaining
return ret_index, ret_block
def top_cut(chats, last) -> list:
for i, chat in enumerate(chats):
if parser.get_offset(chat) > last:
return chats[i:]
return []
def bottom_cut(chats, last) -> list:
for rchat in reversed(chats):
if parser.get_offset(rchat)>=last:
chats.pop()
else:
break
return chats
def split_fill(source_block, block, chats, new_cont,
fetched_first, fetched_last):
if fetched_last <= source_block.last:
return None
block.splitting = False
source_block.splitting = False
source_block.end = fetched_first
block.first = fetched_first
block.last = fetched_last
continuation = new_cont
if fetched_first < source_block.last:
chats = top_cut(chats, source_block.last)
block.first = source_block.last
if block.end < fetched_last:
chats = bottom_cut(chats, block.end)
block.last = block.end
continuation = None
block.chat_data.extend(chats)
block.continuation = continuation
return continuation
def initial_fill(block):
chats, cont = get_chats(block, block.chat_data, block.continuation, block.last)
block.chat_data = chats
return cont
def fill(block, chats, cont, fetched_last):
chats, cont = get_chats(block, chats, cont, fetched_last)
block.chat_data.extend(chats)
return cont
def get_chats(block, chats, cont, fetched_last):
block.last = fetched_last
if fetched_last < block.end or block.is_last:
block.last = fetched_last
block.remaining=block.end-block.last
return chats, cont
for i, line in enumerate(chats):
line_offset = parser.get_offset(line)
if line_offset >= block.end:
block.last = line_offset
block.remaining = 0
block.done = True
return chats[:i], None
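
The file above shows the shape this commit removes: `fetch` returned a bare 4-tuple that `run` had to unpack positionally. The replacement wraps those values in a `Patch` (see patch.py at the end of this diff). A minimal sketch of the difference, with a hypothetical `fetch_chunk` standing in for the real download call:

# Hypothetical stand-in for the real fetch; returns fixed sample values.
def fetch_chunk():
    chats = [{"offset": 1000}, {"offset": 2000}]
    return chats, "token", 1000, 2000

# Before this commit: positional unpacking at every call site.
chats, new_cont, fetched_first, fetched_last = fetch_chunk()

# After this commit: one named, immutable container.
from typing import NamedTuple

class Patch(NamedTuple):
    chats: list = []
    continuation: str = None
    first: int = None
    last: int = None

patch = Patch(*fetch_chunk())
assert patch.first == 1000 and patch.last == 2000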

View File

@@ -5,16 +5,17 @@ import json
from . import parser
from . block import Block
from . dlworker import DownloadWorker
from .. paramgen import arcparam
from .. import config
from urllib.parse import quote
from . patch import Patch
from ... import config
from ... paramgen import arcparam
from concurrent.futures import CancelledError
from urllib.parse import quote
headers = config.headers
REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
"get_live_chat_replay?continuation="
def _split(start, end, count, min_interval = 120):
def _split(start, end, count, min_interval_sec = 120):
"""
Split the section from `start` to `end` into `count` pieces,
and return the beginning of each piece.
@@ -23,7 +24,7 @@ def _split(start, end, count, min_interval = 120):
Returns:
--------
List of the beginning position of each piece.
List of the offset of each block's first chat data.
"""
if not (isinstance(start,int) or isinstance(start,float)) or \
@@ -35,14 +36,14 @@ def _split(start, end, count, min_interval = 120):
raise ValueError("end must be equal to or greater than start.")
if count<1:
raise ValueError("count must be equal to or greater than 1.")
if (end-start)/count < min_interval:
count = int((end-start)/min_interval)
if (end-start)/count < min_interval_sec:
count = int((end-start)/min_interval_sec)
if count == 0 : count = 1
interval= (end-start)/count
if count == 1:
return [start]
return sorted(list(set([int(start+interval*j)
return sorted( list(set( [int(start + interval*j)
for j in range(count) ])))
def ready_blocks(video_id, duration, div, callback):
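
A few worked calls of the `_split` above (values computed from the definition, assuming the new version with `min_interval_sec`; `-1` is the sentinel start passed by ready_blocks below):

# Illustrative results, assuming the _split defined above.
_split(-1, 600, 5)  # (600 - (-1))/5 = 120.2 >= 120, so count stays 5:
                    # [-1, 119, 239, 359, 479]
_split(-1, 300, 5)  # 301/5 = 60.2 < 120, so count = int(301/120) = 2:
                    # [-1, 149]
_split(0, 100, 3)   # 100/3 < 120 and int(100/120) == 0, forced to 1:
                    # [0]
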
@@ -50,27 +51,16 @@ def ready_blocks(video_id, duration, div, callback):
async def _get_blocks( video_id, duration, div, callback):
async with aiohttp.ClientSession() as session:
tasks = [_create_block(session, video_id, pos, seektime, callback)
for pos, seektime in enumerate(_split(-1, duration, div))]
tasks = [_create_block(session, video_id, seektime, callback)
for seektime in _split(-1, duration, div)]
return await asyncio.gather(*tasks)
async def _create_block(session, video_id, pos, seektime, callback):
continuation = arcparam.getparam(
video_id, seektime = seektime)
async def _create_block(session, video_id, seektime, callback):
continuation = arcparam.getparam(video_id, seektime = seektime)
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
for _ in range(3):
try:
async with session.get(url, headers = headers) as resp:
text = await resp.text()
next_continuation, actions = parser.parse(json.loads(text))
except json.JSONDecodeError:
print("JSONDecodeError occured")
await asyncio.sleep(1)
continue
break
else:
raise json.JSONDecodeError
async with session.get(url, headers = headers) as resp:
text = await resp.text()
next_continuation, actions = parser.parse(json.loads(text))
if actions:
first = parser.get_offset(actions[0])
last = parser.get_offset(actions[-1])
@@ -82,59 +72,50 @@ def ready_blocks(video_id, duration, div, callback):
first = first,
last = last
)
"""
fetch initial blocks.
"""
loop = asyncio.get_event_loop()
result = loop.run_until_complete(
blocks = loop.run_until_complete(
_get_blocks(video_id, duration, div, callback))
return result
return blocks
def download_chunk(callback, blocks, video_id):
def download_patch(callback, blocks, video_id):
async def _allocate_workers():
workers = [
DownloadWorker(
fetch = _fetch,
block = block,
blocks = blocks,
video_id = video_id
fetch = _fetch, block = block,
blocks = blocks, video_id = video_id
)
for i,block in enumerate(blocks)
for block in blocks
]
async with aiohttp.ClientSession() as session:
tasks = [worker.run(session) for worker in workers]
return await asyncio.gather(*tasks)
async def _fetch(continuation,session):
async def _fetch(continuation,session) -> Patch:
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
for _ in range(3):
try:
async with session.get(url,headers = config.headers) as resp:
chat_json = await resp.text()
except json.JSONDecodeError:
print("JSONDecodeError occured")
await asyncio.sleep(1)
continue
break
else:
raise json.JSONDecodeError
async with session.get(url,headers = config.headers) as resp:
chat_json = await resp.text()
continuation, actions = parser.parse(json.loads(chat_json))
if actions:
last = parser.get_offset(actions[-1])
first = parser.get_offset(actions[0])
if callback:
callback(actions, last - first)
return actions, continuation, first, last
return [], continuation, None, None
return Patch(actions, continuation, first, last)
return Patch()
"""
allocate workers and assign blocks.
"""
loop = asyncio.get_event_loop()
try:
loop.run_until_complete(_allocate_workers())
except CancelledError:
pass
async def shutdown():
async def _shutdown():
print("\nshutdown...")
tasks = [t for t in asyncio.all_tasks()
if t is not asyncio.current_task()]
@@ -147,5 +128,5 @@ async def shutdown():
def cancel():
loop = asyncio.get_event_loop()
loop.create_task(shutdown())
loop.create_task(_shutdown())

View File

@@ -1,12 +1,10 @@
from . import parser
class Block:
"""Block object represents virtual chunk of chatdata.
"""Block object represents something like a box
to join chunk of chatdata.
Parameters:
---------
pos : int :
index of this block on block list.
first : int :
videoOffsetTimeMs of the first chat_data
(chat_data[0])
@@ -37,23 +35,23 @@ class Block:
is_last : bool :
whether this block is the last one in blocklist.
splitting : bool :
whether this block is in the process of splitting.
during_split : bool :
whether this block is in the process of being split.
While True, this block is excluded from the duplicate-split procedure.
"""
__slots__ = ['first','last','end','continuation','chat_data','remaining',
'done','is_last','splitting']
'done','is_last','during_split']
def __init__(self, first = 0, last = 0, end = 0,
continuation = '', chat_data = [], is_last = False,
splitting = False):
during_split = False):
self.first = first
self.last = last
self.end = end
self.continuation = continuation
self.chat_data = chat_data
self.done = False
self.remaining = self.end- self.last
self.remaining = self.end - self.last
self.is_last = is_last
self.splitting = splitting
self.during_split = during_split
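
A minimal sketch of the Block bookkeeping above (offsets in milliseconds; the continuation token is a placeholder):

# Illustrative only: a block covering offsets 0..60000 ms whose
# download has so far reached 45000 ms.
b = Block(first = 0, last = 45000, end = 60000, continuation = 'token')
print(b.remaining)            # 15000, i.e. end - last, set in __init__
b.last = 60000                # workers advance `last` as patches arrive
b.remaining = b.end - b.last  # and keep `remaining` in step
print(b.done)                 # False until a worker marks the block done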

View File

@@ -0,0 +1,87 @@
from . import parser
from . block import Block
from . patch import Patch, fill, split
from ... paramgen import arcparam
class DownloadWorker:
"""
DownloadWorker associates a download session with a block.
When a dlworker finishes downloading its block, the undone block
with the most data remaining is split, and the free dlworker takes over the new part.
Parameters
----------
fetch : func :
download function of asyncdl
block : Block :
Block object that includes chat_data
blocks : list :
List of Block(s)
video_id : str :
YouTube video ID of the archived stream.
parent_block : Block :
the block from which the current block was split
"""
__slots__ = ['block', 'fetch', 'blocks', 'video_id', 'parent_block']
def __init__(self, fetch, block, blocks, video_id ):
self.block = block
self.fetch = fetch
self.blocks = blocks
self.video_id = video_id
self.parent_block = None
async def run(self, session):
while self.block.continuation:
patch = await self.fetch(
self.block.continuation, session)
if patch.continuation is None:
"""TODO : make the dlworker assigned to the last block
to work more than twice as possible.
"""
break
if self.parent_block:
split(self.parent_block, self.block, patch)
self.parent_block = None
else:
fill(self.block, patch)
if self.block.continuation is None:
"""finished downloading this block """
self.block.done = True
self.block = _search_new_block(self)
def _search_new_block(worker) -> Block:
index, undone_block = _get_undone_block(worker.blocks)
if undone_block is None:
return Block(continuation = None)
mean = (undone_block.last + undone_block.end)/2
continuation = arcparam.getparam(worker.video_id, seektime = mean/1000)
worker.parent_block = undone_block
worker.parent_block.during_split = True
new_block = Block(
end = undone_block.end,
chat_data = [],
continuation = continuation,
during_split = True,
is_last = worker.parent_block.is_last)
worker.blocks.insert(index+1, new_block)
return new_block
def _get_undone_block(blocks) -> (int, Block):
min_interval_ms = 120000
max_remaining = 0
undone_block = None
index_undone_block = 0
for index, block in enumerate(blocks):
if block.done or block.during_split:
continue
remaining = block.remaining
if remaining > max_remaining and remaining > min_interval_ms:
index_undone_block = index
undone_block = block
max_remaining = remaining
return index_undone_block, undone_block
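
To make `_get_undone_block` concrete: it returns the index and instance of the block with the largest `remaining` above the 120000 ms threshold, skipping blocks that are done or mid-split. A toy run (assuming the Block class from block.py; offsets in ms):

# Illustrative blocks; remaining = end - last is set in __init__.
a = Block(last = 10000, end = 200000)   # remaining 190000
b = Block(last = 0,     end = 300000)   # remaining 300000, but mid-split
c = Block(last = 90000, end = 100000)   # remaining 10000, under threshold
b.during_split = True
index, undone = _get_undone_block([a, b, c])
# -> (0, a): b is skipped while during_split is True, c falls below
#    min_interval_ms, so a has the largest eligible remaining.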

View File

@@ -1,9 +1,9 @@
from . import asyncdl
from . import duplcheck
from . import parser
from . duplcheck import duplicate_head, duplicate_tail, overwrap
from . videoinfo import VideoInfo
from .. import config
from .. exceptions import InvalidVideoIdException
from .. videoinfo import VideoInfo
from ... import config
from ... exceptions import InvalidVideoIdException
logger = config.logger(__name__)
headers=config.headers
@@ -22,36 +22,36 @@ class Downloader:
self.callback = callback
self.blocks = []
def ready_blocks(self):
result = asyncdl.ready_blocks(
def _ready_blocks(self):
blocks = asyncdl.ready_blocks(
self.video_id, self.duration, self.div, self.callback)
self.blocks = [block for block in result if block]
self.blocks = [block for block in blocks if block]
return self
def remove_duplicate_head(self):
self.blocks = duplicate_head(self.blocks)
def _remove_duplicate_head(self):
self.blocks = duplcheck.remove_duplicate_head(self.blocks)
return self
def set_temporary_last(self):
def _set_block_end(self):
for i in range(len(self.blocks)-1):
self.blocks[i].end = self.blocks[i+1].first
self.blocks[-1].end = self.duration*1000
self.blocks[-1].is_last =True
return self
def remove_overwrap(self):
self.blocks = overwrap(self.blocks)
def _remove_overlap(self):
self.blocks = duplcheck.remove_overlap(self.blocks)
return self
def download_blocks(self):
asyncdl.download_chunk(self.callback, self.blocks, self.video_id)
def _download_blocks(self):
asyncdl.download_patch(self.callback, self.blocks, self.video_id)
return self
def remove_duplicate_tail(self):
self.blocks = duplicate_tail(self.blocks)
def _remove_duplicate_tail(self):
self.blocks = duplcheck.remove_duplicate_tail(self.blocks)
return self
def combine(self):
def _combine(self):
ret = []
for block in self.blocks:
ret.extend(block.chat_data)
@@ -59,13 +59,13 @@ class Downloader:
def download(self):
return (
self.ready_blocks()
.remove_duplicate_head()
.remove_overwrap()
.set_temporary_last()
.download_blocks()
.remove_duplicate_tail()
.combine()
self._ready_blocks()
._remove_duplicate_head()
._set_block_end()
._remove_overlap()
._download_blocks()
._remove_duplicate_tail()
._combine()
)
def download(video_id, div = 1, callback = None, processor = None):
@@ -86,4 +86,4 @@ def download(video_id, div = 1, callback = None, processor = None):
)
def cancel():
asyncdl.cancel()
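
Given the module-level `download` above, typical use is a single call; `div` controls how many blocks the archive is initially split into and `callback` receives each fetched chunk. A sketch (the video ID is a placeholder, and the return value is assumed to be the combined chat data produced by Downloader.download()):

# Illustrative usage; "XXXXXXXXXXX" is a placeholder video ID.
def progress(actions, fetched_ms):
    # called per fetched patch with the chat actions and the
    # span of offsets (ms) they cover
    print(f"fetched {len(actions)} chats over {fetched_ms} ms")

data = download("XXXXXXXXXXX", div = 8, callback = progress)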

View File

@@ -20,18 +20,18 @@ def check_duplicate(chatdata):
and
tbl_type[i] == tbl_type[j]
)
print("creating table...")
create_table(chatdata,max_range)
print("searching duplicate data...")
return [{ "i":{
"index" : i, "id" : parser.get_id(chatdata[i]),
"offsetTime" : parser.get_offset(chatdata[i])
"offsetTime" : parser.get_offset(chatdata[i]),
"type" : parser.get_type(chatdata[i])
},
"j":{
"index" : j, "id" : parser.get_id(chatdata[j]),
"offsetTime" : parser.get_offset(chatdata[j])
"offsetTime" : parser.get_offset(chatdata[j]),
"type" : parser.get_type(chatdata[j])
}
}
for i in range(max_range) for j in range(i+1,max_range)
@@ -59,18 +59,17 @@ def check_duplicate_offset(chatdata):
print("creating table...")
create_table(chatdata,max_range)
print("searching duplicate offset data...")
print("searching duplicate data...")
return [{
"index" : i, "id" : tbl_id[i],
"offsetTime" : tbl_offset[i],
"type:" : tbl_type[i]
}
for i in range(max_range-1)
if is_duplicate(i,i+1)]
def duplicate_head(blocks):
def remove_duplicate_head(blocks):
if len(blocks) == 1 : return blocks
def is_duplicate_head(index):
@@ -97,16 +96,14 @@ def duplicate_head(blocks):
ret.append(blocks[-1])
return ret
def duplicate_tail(blocks):
def remove_duplicate_tail(blocks):
if len(blocks) == 1 : return blocks
def is_duplicate_tail(index):
if len(blocks[index].chat_data) == 0:
return True
elif len(blocks[index-1].chat_data) == 0:
return False
id_0 = parser.get_id(blocks[index-1].chat_data[-1])
id_1 = parser.get_id(blocks[index].chat_data[-1])
type_0 = parser.get_type(blocks[index-1].chat_data[-1])
@@ -123,32 +120,34 @@ def duplicate_tail(blocks):
if i == 0 or not is_duplicate_tail(i) ]
return ret
def overwrap(blocks):
def remove_overlap(blocks):
"""
Fix overlapped blocks after ready_blocks().
Align the last offset of each block with the first offset
of the next block (which equals each block's `end` offset).
"""
if len(blocks) == 1 : return blocks
ret = []
a = 0
b = 1
jmp = False
ret.append(blocks[0])
while a < len(blocks)-2:
while blocks[a].last > blocks[b].first:
b+=1
if b == len(blocks)-1:
jmp = True
break
if jmp: break
if b-a == 1:
a = b
else:
a = b-1
ret.append(blocks[a])
b = a+1
ret.append(blocks[-1])
return ret
for block in blocks:
if block.is_last:
break
if len(block.chat_data)==0:
continue
block_end = block.end
if block.last >= block_end:
for line in reversed(block.chat_data):
if parser.get_offset(line) < block_end:
break
block.chat_data.pop()
block.last = parser.get_offset(line)
block.remaining=0
block.done=True
block.continuation = None
return blocks
def _dump(blocks):
print(__name__)
print(f"---------- first last end {'':>3}---")
print(f"---------- first last end---")
for i,block in enumerate(blocks):
print(f"block[{i:3}] {block.first:>10} {block.last:>10} {block.end:>10}")

View File

@@ -1,6 +1,6 @@
import json
from .. import config
from .. exceptions import (
from ... import config
from ... exceptions import (
ResponseContextError,
NoContentsException,
NoContinuationsException )
@@ -23,15 +23,15 @@ def parse(jsn):
if jsn is None:
raise ValueError("parameter JSON is None")
if jsn['response']['responseContext'].get('errors'):
raise ResponseContextError('Cannot connect to the video. '
'The video ID may be wrong, or the video may have been deleted or made private.')
raise ResponseContextError(
'video_id is invalid or private/deleted.')
contents=jsn['response'].get('continuationContents')
if contents is None:
raise NoContentsException('Could not retrieve chat data.')
raise NoContentsException('No chat data.')
cont = contents['liveChatContinuation']['continuations'][0]
if cont is None:
raise NoContinuationsException('There is no Continuation.')
raise NoContinuationsException('No Continuation')
metadata = cont.get('liveChatReplayContinuationData')
if metadata:
continuation = metadata.get("continuation")

View File

@@ -0,0 +1,54 @@
from . import parser
from . block import Block
from typing import NamedTuple
class Patch(NamedTuple):
"""
Patch represents a chunk of chat data
fetched by asyncdl.download_patch._fetch().
"""
chats : list = []
continuation : str = None
first : int = None
last : int = None
def fill(block:Block, patch:Patch):
block_end = block.end
if patch.last < block_end or block.is_last:
set_patch(block, patch)
return
for line in reversed(patch.chats):
line_offset = parser.get_offset(line)
if line_offset < block_end:
break
patch.chats.pop()
set_patch(block, patch._replace(
continuation = None,
last = line_offset
)
)
block.remaining=0
block.done=True
def split(parent_block:Block, child_block:Block, patch:Patch):
parent_block.during_split = False
"""patch overlaps with parent_block"""
if patch.first <= parent_block.last:
child_block.continuation = None
''' Leave child_block.during_split == True
to exclude it from the duplicate-split procedure.'''
return
child_block.during_split = False
child_block.first=patch.first
parent_block.end =patch.first
fill(child_block, patch)
def set_patch(block:Block, patch:Patch):
block.continuation = patch.continuation
block.chat_data.extend(patch.chats)
block.last = patch.last
block.remaining = block.end-block.last
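
Since Patch is a NamedTuple it is immutable, which is why `fill` above derives a truncated copy via `_replace` before handing it to `set_patch`. A small sketch of that idiom:

# Illustrative: truncate a patch at a block boundary via _replace.
p = Patch(chats = [{"offset": 1000}], continuation = "token",
          first = 1000, last = 5000)
trimmed = p._replace(continuation = None, last = 3000)
print(p.last, trimmed.last)          # 5000 3000: the original is unchanged
print(trimmed.continuation is None)  # True -> the block stops fetching
# Note: the default chats = [] above is a single list object shared by
# every bare Patch(); fill() mutates patch.chats, so bare Patch() patches
# should be treated as empty markers only.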