Aggregate return values with patch class

taizan-hokuto
2020-02-16 20:43:12 +09:00
parent 6fdb3bf8cf
commit c4cf424702
23 changed files with 35254 additions and 327 deletions


@@ -1,153 +0,0 @@
from . import parser
from .. paramgen import arcparam
from . block import Block
class DownloadWorker:
"""
DownloadWorker associates a download session with a block.
Parameter
----------
fetch : func :
download function of asyncdl
block : Block :
Block object associated with this worker
blocks : list :
List of Block(s)
video_id : str :
source_block : Block :
        the Block from which the current downloading block was split
"""
__slots__ = ['fetch', 'block', 'blocks', 'video_id', 'source_block']
def __init__(self, fetch, block, blocks, video_id ):
self.block = block
self.fetch = fetch
self.blocks = blocks
self.video_id = video_id
self.source_block = None
async def run(self, session):
"""Remove extra chats just after ready_blocks(). """
continuation = initial_fill(self.block)
"""download loop """
while continuation:
chats, new_cont, fetched_first, fetched_last = await self.fetch(
continuation, session)
if fetched_first is None:
break
if self.source_block:
continuation = split_fill(
self.source_block, self.block, chats, new_cont,
fetched_first, fetched_last)
self.source_block = None
else:
continuation = fill(self.block, chats, new_cont, fetched_last)
if continuation is None:
new_block = get_new_block(self)
self.block = new_block
continuation = new_block.continuation
def get_new_block(worker) -> Block:
worker.block.done = True
index,undone_block = search_undone_block(worker.blocks)
if undone_block is None:
return Block(continuation = None)
mean = (undone_block.end + undone_block.last)/2
continuation = arcparam.getparam(worker.video_id, seektime = mean/1000)
worker.source_block = undone_block
worker.source_block.splitting = True
new_block = Block(
end = undone_block.end,
chat_data = [],
continuation = continuation,
splitting = True,
is_last = worker.source_block.is_last)
worker.blocks.insert(index+1,new_block)
return new_block
def search_undone_block(blocks) -> (int, Block):
"""
Returns
--------
    ret_index : int :
        index in blocks of the Block whose download is not complete.
    ret_block : Block :
        the Block whose download is not complete.
"""
max_remaining = 0
ret_block = None
ret_index = 0
for index, block in enumerate(blocks):
if block.done or block.splitting:
continue
remaining = block.remaining
if remaining > max_remaining and remaining > 120000:
ret_index = index
ret_block = block
max_remaining = remaining
return ret_index, ret_block
def top_cut(chats, last) -> list:
for i, chat in enumerate(chats):
if parser.get_offset(chat) > last:
return chats[i:]
return []
def bottom_cut(chats, last) -> list:
for rchat in reversed(chats):
if parser.get_offset(rchat)>=last:
chats.pop()
else:
break
return chats
def split_fill(source_block, block, chats, new_cont,
fetched_first, fetched_last):
if fetched_last <= source_block.last:
return None
block.splitting = False
source_block.splitting = False
source_block.end = fetched_first
block.first = fetched_first
block.last = fetched_last
continuation = new_cont
if fetched_first < source_block.last:
chats = top_cut(chats, source_block.last)
block.first = source_block.last
if block.end < fetched_last:
chats = bottom_cut(chats, block.end)
block.last = block.end
continuation = None
block.chat_data.extend(chats)
block.continuation = continuation
return continuation
def initial_fill(block):
chats, cont = get_chats(block, block.chat_data, block.continuation, block.last)
block.chat_data = chats
return cont
def fill(block, chats, cont, fetched_last):
chats, cont = get_chats(block, chats, cont, fetched_last)
block.chat_data.extend(chats)
return cont
def get_chats(block, chats, cont, fetched_last):
block.last = fetched_last
if fetched_last < block.end or block.is_last:
block.last = fetched_last
block.remaining=block.end-block.last
return chats, cont
for i, line in enumerate(chats):
line_offset = parser.get_offset(line)
if line_offset >= block.end:
block.last = line_offset
block.remaining = 0
block.done = True
return chats[:i], None


@@ -5,16 +5,17 @@ import json
from . import parser
from . block import Block
from . dlworker import DownloadWorker
-from .. paramgen import arcparam
-from .. import config
-from urllib.parse import quote
+from . patch import Patch
+from ... import config
+from ... paramgen import arcparam
from concurrent.futures import CancelledError
+from urllib.parse import quote
headers = config.headers
REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
"get_live_chat_replay?continuation="
-def _split(start, end, count, min_interval = 120):
+def _split(start, end, count, min_interval_sec = 120):
"""
Split section from `start` to `end` into `count` pieces,
and returns the beginning of each piece.
@@ -23,7 +24,7 @@ def _split(start, end, count, min_interval = 120):
Returns:
--------
-    List of the beginning position of each piece.
+    List of the offset of each block's first chat data.
"""
if not (isinstance(start,int) or isinstance(start,float)) or \
@@ -35,14 +36,14 @@ def _split(start, end, count, min_interval = 120):
raise ValueError("end must be equal to or greater than start.")
if count<1:
raise ValueError("count must be equal to or greater than 1.")
-    if (end-start)/count < min_interval:
-        count = int((end-start)/min_interval)
+    if (end-start)/count < min_interval_sec:
+        count = int((end-start)/min_interval_sec)
if count == 0 : count = 1
interval= (end-start)/count
if count == 1:
return [start]
-    return sorted(list(set([int(start+interval*j)
+    return sorted( list(set( [int(start + interval*j)
for j in range(count) ])))
def ready_blocks(video_id, duration, div, callback):
@@ -50,27 +51,16 @@ def ready_blocks(video_id, duration, div, callback):
async def _get_blocks( video_id, duration, div, callback):
async with aiohttp.ClientSession() as session:
-            tasks = [_create_block(session, video_id, pos, seektime, callback)
-                     for pos, seektime in enumerate(_split(-1, duration, div))]
+            tasks = [_create_block(session, video_id, seektime, callback)
+                     for seektime in _split(-1, duration, div)]
return await asyncio.gather(*tasks)
-    async def _create_block(session, video_id, pos, seektime, callback):
-        continuation = arcparam.getparam(
-            video_id, seektime = seektime)
+    async def _create_block(session, video_id, seektime, callback):
+        continuation = arcparam.getparam(video_id, seektime = seektime)
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
-        for _ in range(3):
-            try:
-                async with session.get(url, headers = headers) as resp:
-                    text = await resp.text()
-                next_continuation, actions = parser.parse(json.loads(text))
-            except json.JSONDecodeError:
-                print("JSONDecodeError occured")
-                await asyncio.sleep(1)
-                continue
-            break
-        else:
-            raise json.JSONDecodeError
+        async with session.get(url, headers = headers) as resp:
+            text = await resp.text()
+        next_continuation, actions = parser.parse(json.loads(text))
if actions:
first = parser.get_offset(actions[0])
last = parser.get_offset(actions[-1])
@@ -82,59 +72,50 @@ def ready_blocks(video_id, duration, div, callback):
first = first,
last = last
)
"""
fetch initial blocks.
"""
loop = asyncio.get_event_loop()
-    result = loop.run_until_complete(
+    blocks = loop.run_until_complete(
        _get_blocks(video_id, duration, div, callback))
-    return result
+    return blocks
-def download_chunk(callback, blocks, video_id):
+def download_patch(callback, blocks, video_id):
async def _allocate_workers():
workers = [
DownloadWorker(
-                fetch = _fetch,
-                block = block,
-                blocks = blocks,
-                video_id = video_id
+                fetch = _fetch, block = block,
+                blocks = blocks, video_id = video_id
            )
-            for i,block in enumerate(blocks)
+            for block in blocks
]
async with aiohttp.ClientSession() as session:
tasks = [worker.run(session) for worker in workers]
return await asyncio.gather(*tasks)
-    async def _fetch(continuation,session):
+    async def _fetch(continuation,session) -> Patch:
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
-        for _ in range(3):
-            try:
-                async with session.get(url,headers = config.headers) as resp:
-                    chat_json = await resp.text()
-            except json.JSONDecodeError:
-                print("JSONDecodeError occured")
-                await asyncio.sleep(1)
-                continue
-            break
-        else:
-            raise json.JSONDecodeError
+        async with session.get(url,headers = config.headers) as resp:
+            chat_json = await resp.text()
        continuation, actions = parser.parse(json.loads(chat_json))
        if actions:
            last = parser.get_offset(actions[-1])
            first = parser.get_offset(actions[0])
            if callback:
                callback(actions, last - first)
-            return actions, continuation, first, last
-        return [], continuation, None, None
+            return Patch(actions, continuation, first, last)
+        return Patch()
"""
allocate workers and assign blocks.
"""
loop = asyncio.get_event_loop()
try:
loop.run_until_complete(_allocate_workers())
except CancelledError:
pass
-async def shutdown():
+async def _shutdown():
print("\nshutdown...")
tasks = [t for t in asyncio.all_tasks()
if t is not asyncio.current_task()]
@@ -147,5 +128,5 @@ async def shutdown():
def cancel():
loop = asyncio.get_event_loop()
-    loop.create_task(shutdown())
+    loop.create_task(_shutdown())


@@ -1,12 +1,10 @@
from . import parser
class Block:
"""Block object represents virtual chunk of chatdata.
"""Block object represents something like a box
to join chunk of chatdata.
Parameter:
---------
pos : int :
index of this block on block list.
first : int :
videoOffsetTimeMs of the first chat_data
(chat_data[0])
@@ -37,23 +35,23 @@ class Block:
is_last : bool :
whether this block is the last one in blocklist.
-    splitting : bool :
-        whether this block is in the process of splitting.
+    during_split : bool :
+        whether this block is in the process of being split.
+        While True, this block is excluded from the duplicate-split procedure.
"""
__slots__ = ['first','last','end','continuation','chat_data','remaining',
-                 'done','is_last','splitting']
+                 'done','is_last','during_split']
-    def __init__(self, first = 0, last = 0, end = 0,
+    def __init__(self, first = 0, last = 0, end = 0,
                 continuation = '', chat_data = [], is_last = False,
-                 splitting = False):
+                 during_split = False):
self.first = first
self.last = last
self.end = end
self.continuation = continuation
self.chat_data = chat_data
self.done = False
-        self.remaining = self.end- self.last
+        self.remaining = self.end - self.last
        self.is_last = is_last
-        self.splitting = splitting
+        self.during_split = during_split
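
For orientation, the renamed fields behave as before: remaining is derived in __init__ from end and last, and during_split simply replaces splitting. A minimal sketch, using the import path from this commit's tests:

    from pytchat.tool.download.block import Block

    # A block that should cover offsets up to 60000 ms but whose
    # download has only reached 15000 ms so far:
    b = Block(first=0, last=15000, end=60000, continuation='token')
    assert b.remaining == 45000       # end - last
    assert b.during_split is False    # not currently being split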


@@ -0,0 +1,87 @@
from . import parser
from . block import Block
from . patch import Patch, fill, split
from ... paramgen import arcparam
class DownloadWorker:
"""
DownloadWorker associates a download session with a block.
    When the dlworker finishes downloading, the block
    being downloaded is split and assigned to the free dlworker.
Parameter
----------
fetch : func :
download function of asyncdl
block : Block :
Block object that includes chat_data
blocks : list :
List of Block(s)
video_id : str :
parent_block : Block :
        the block from which the current block was split
"""
__slots__ = ['block', 'fetch', 'blocks', 'video_id', 'parent_block']
def __init__(self, fetch, block, blocks, video_id ):
self.block = block
self.fetch = fetch
self.blocks = blocks
self.video_id = video_id
self.parent_block = None
async def run(self, session):
while self.block.continuation:
patch = await self.fetch(
self.block.continuation, session)
if patch.continuation is None:
"""TODO : make the dlworker assigned to the last block
to work more than twice as possible.
"""
break
if self.parent_block:
split(self.parent_block, self.block, patch)
self.parent_block = None
else:
fill(self.block, patch)
if self.block.continuation is None:
"""finished downloading this block """
self.block.done = True
self.block = _search_new_block(self)
def _search_new_block(worker) -> Block:
index, undone_block = _get_undone_block(worker.blocks)
if undone_block is None:
return Block(continuation = None)
mean = (undone_block.last + undone_block.end)/2
continuation = arcparam.getparam(worker.video_id, seektime = mean/1000)
worker.parent_block = undone_block
worker.parent_block.during_split = True
new_block = Block(
end = undone_block.end,
chat_data = [],
continuation = continuation,
during_split = True,
is_last = worker.parent_block.is_last)
worker.blocks.insert(index+1, new_block)
return new_block
def _get_undone_block(blocks) -> (int, Block):
min_interval_ms = 120000
max_remaining = 0
undone_block = None
index_undone_block = 0
for index, block in enumerate(blocks):
if block.done or block.during_split:
continue
remaining = block.remaining
if remaining > max_remaining and remaining > min_interval_ms:
index_undone_block = index
undone_block = block
max_remaining = remaining
return index_undone_block, undone_block
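
_get_undone_block scans the block list and keeps the unfinished block with the largest remaining range, skipping anything done or mid-split and anything with less than min_interval_ms (120000 ms, i.e. 120 s) left. A small worked example, assuming this module is importable as pytchat.tool.download.dlworker, as the surrounding imports suggest:

    from pytchat.tool.download.block import Block
    from pytchat.tool.download.dlworker import _get_undone_block

    blocks = [
        Block(first=0,      last=60000,  end=60000),   # remaining 0
        Block(first=60000,  last=90000,  end=300000),  # remaining 210000
        Block(first=300000, last=310000, end=360000),  # remaining 50000 < 120000
    ]
    index, block = _get_undone_block(blocks)
    assert index == 1 and block is blocks[1]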


@@ -1,9 +1,9 @@
from . import asyncdl
from . import duplcheck
from . import parser
-from . duplcheck import duplicate_head, duplicate_tail, overwrap
-from . videoinfo import VideoInfo
-from .. import config
-from .. exceptions import InvalidVideoIdException
+from .. videoinfo import VideoInfo
+from ... import config
+from ... exceptions import InvalidVideoIdException
logger = config.logger(__name__)
headers=config.headers
@@ -22,36 +22,36 @@ class Downloader:
self.callback = callback
self.blocks = []
-    def ready_blocks(self):
-        result = asyncdl.ready_blocks(
+    def _ready_blocks(self):
+        blocks = asyncdl.ready_blocks(
            self.video_id, self.duration, self.div, self.callback)
-        self.blocks = [block for block in result if block]
+        self.blocks = [block for block in blocks if block]
return self
-    def remove_duplicate_head(self):
-        self.blocks = duplicate_head(self.blocks)
+    def _remove_duplicate_head(self):
+        self.blocks = duplcheck.remove_duplicate_head(self.blocks)
return self
-    def set_temporary_last(self):
+    def _set_block_end(self):
for i in range(len(self.blocks)-1):
self.blocks[i].end = self.blocks[i+1].first
self.blocks[-1].end = self.duration*1000
self.blocks[-1].is_last =True
return self
-    def remove_overwrap(self):
-        self.blocks = overwrap(self.blocks)
+    def _remove_overlap(self):
+        self.blocks = duplcheck.remove_overlap(self.blocks)
return self
-    def download_blocks(self):
-        asyncdl.download_chunk(self.callback, self.blocks, self.video_id)
+    def _download_blocks(self):
+        asyncdl.download_patch(self.callback, self.blocks, self.video_id)
return self
-    def remove_duplicate_tail(self):
-        self.blocks = duplicate_tail(self.blocks)
+    def _remove_duplicate_tail(self):
+        self.blocks = duplcheck.remove_duplicate_tail(self.blocks)
return self
-    def combine(self):
+    def _combine(self):
ret = []
for block in self.blocks:
ret.extend(block.chat_data)
@@ -59,13 +59,13 @@ class Downloader:
def download(self):
return (
-            self.ready_blocks()
-                .remove_duplicate_head()
-                .remove_overwrap()
-                .set_temporary_last()
-                .download_blocks()
-                .remove_duplicate_tail()
-                .combine()
+            self._ready_blocks()
+                ._remove_duplicate_head()
+                ._set_block_end()
+                ._remove_overlap()
+                ._download_blocks()
+                ._remove_duplicate_tail()
+                ._combine()
)
def download(video_id, div = 1, callback = None, processor = None):
@@ -86,4 +86,4 @@ def download(video_id, div = 1, callback = None, processor = None):
)
def cancel():
-    asyncdl.cancel()
+    asyncdl.cancel()
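
The chain in download() is a fixed pipeline: prepare blocks, prune duplicate heads, set each block's end, drop overlap, download, prune duplicate tails, and concatenate. A usage sketch of the module-level entry point, assuming this module is importable as pytchat.tool.download.downloader:

    from pytchat.tool.download import downloader

    # "VIDEO_ID" is a placeholder. div=8 splits the replay across
    # 8 parallel workers; the exact return value depends on the
    # elided body of download() (e.g. whether a processor is given).
    chat_data = downloader.download("VIDEO_ID", div=8)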


@@ -20,18 +20,18 @@ def check_duplicate(chatdata):
and
tbl_type[i] == tbl_type[j]
)
print("creating table...")
create_table(chatdata,max_range)
print("searching duplicate data...")
return [{ "i":{
"index" : i, "id" : parser.get_id(chatdata[i]),
"offsetTime" : parser.get_offset(chatdata[i])
"offsetTime" : parser.get_offset(chatdata[i]),
"type" : parser.get_type(chatdata[i])
},
"j":{
"index" : j, "id" : parser.get_id(chatdata[j]),
"offsetTime" : parser.get_offset(chatdata[j])
"offsetTime" : parser.get_offset(chatdata[j]),
"type" : parser.get_type(chatdata[j])
}
}
for i in range(max_range) for j in range(i+1,max_range)
@@ -59,18 +59,17 @@ def check_duplicate_offset(chatdata):
print("creating table...")
create_table(chatdata,max_range)
print("searching duplicate offset data...")
print("searching duplicate data...")
return [{
"index" : i, "id" : tbl_id[i],
"offsetTime" : tbl_offset[i],
"type:" : tbl_type[i]
}
for i in range(max_range-1)
if is_duplicate(i,i+1)]
-def duplicate_head(blocks):
+def remove_duplicate_head(blocks):
if len(blocks) == 1 : return blocks
def is_duplicate_head(index):
@@ -97,16 +96,14 @@ def duplicate_head(blocks):
ret.append(blocks[-1])
return ret
-def duplicate_tail(blocks):
+def remove_duplicate_tail(blocks):
if len(blocks) == 1 : return blocks
def is_duplicate_tail(index):
if len(blocks[index].chat_data) == 0:
return True
elif len(blocks[index-1].chat_data) == 0:
return False
id_0 = parser.get_id(blocks[index-1].chat_data[-1])
id_1 = parser.get_id(blocks[index].chat_data[-1])
type_0 = parser.get_type(blocks[index-1].chat_data[-1])
@@ -123,32 +120,34 @@ def duplicate_tail(blocks):
if i == 0 or not is_duplicate_tail(i) ]
return ret
-def overwrap(blocks):
+def remove_overlap(blocks):
    """
-    Fix overlapped blocks after ready_blocks().
+    Align the last offset of each block to the first offset
+    of next block (equals `end` offset of each block).
    """
    if len(blocks) == 1 : return blocks
-    ret = []
-    a = 0
-    b = 1
-    jmp = False
-    ret.append(blocks[0])
-    while a < len(blocks)-2:
-        while blocks[a].last > blocks[b].first:
-            b+=1
-            if b == len(blocks)-1:
-                jmp = True
-                break
-        if jmp: break
-        if b-a == 1:
-            a = b
-        else:
-            a = b-1
-        ret.append(blocks[a])
-        b = a+1
-    ret.append(blocks[-1])
-    return ret
+    for block in blocks:
+        if block.is_last:
+            break
+        if len(block.chat_data)==0:
+            continue
+        block_end = block.end
+        if block.last >= block_end:
+            for line in reversed(block.chat_data):
+                if parser.get_offset(line) < block_end:
+                    break
+                block.chat_data.pop()
+            block.last = parser.get_offset(line)
+            block.remaining=0
+            block.done=True
+            block.continuation = None
+    return blocks
def _dump(blocks):
print(__name__)
print(f"---------- first last end {'':>3}---")
print(f"---------- first last end---")
for i,block in enumerate(blocks):
print(f"block[{i:3}] {block.first:>10} {block.last:>10} {block.end:>10}")


@@ -1,6 +1,6 @@
import json
-from .. import config
-from .. exceptions import (
+from ... import config
+from ... exceptions import (
ResponseContextError,
NoContentsException,
NoContinuationsException )
@@ -23,15 +23,15 @@ def parse(jsn):
if jsn is None:
raise ValueError("parameter JSON is None")
if jsn['response']['responseContext'].get('errors'):
-        raise ResponseContextError('動画に接続できません。'
-            '動画IDが間違っているか、動画が削除/非公開の可能性があります。')
+        raise ResponseContextError(
+            'video_id is invalid or private/deleted.')
contents=jsn['response'].get('continuationContents')
if contents is None:
-        raise NoContentsException('チャットデータを取得できませんでした。')
+        raise NoContentsException('No chat data.')
cont = contents['liveChatContinuation']['continuations'][0]
if cont is None:
-        raise NoContinuationsException('Continuationがありません。')
+        raise NoContinuationsException('No Continuation')
metadata = cont.get('liveChatReplayContinuationData')
if metadata:
continuation = metadata.get("continuation")


@@ -0,0 +1,54 @@
from . import parser
from . block import Block
from typing import NamedTuple
class Patch(NamedTuple):
"""
    Patch represents a chunk of chat data
    fetched by asyncdl.download_patch._fetch().
"""
chats : list = []
continuation : str = None
first : int = None
last : int = None
def fill(block:Block, patch:Patch):
block_end = block.end
if patch.last < block_end or block.is_last:
set_patch(block, patch)
return
for line in reversed(patch.chats):
line_offset = parser.get_offset(line)
if line_offset < block_end:
break
patch.chats.pop()
set_patch(block, patch._replace(
continuation = None,
last = line_offset
)
)
block.remaining=0
block.done=True
def split(parent_block:Block, child_block:Block, patch:Patch):
parent_block.during_split = False
"""patch overlaps with parent_block"""
if patch.first <= parent_block.last:
child_block.continuation = None
''' Leave child_block.during_split == True
to exclude from during_split sequence.'''
return
child_block.during_split = False
child_block.first=patch.first
parent_block.end =patch.first
fill(child_block, patch)
def set_patch(block:Block, patch:Patch):
block.continuation = patch.continuation
block.chat_data.extend(patch.chats)
block.last = patch.last
block.remaining = block.end-block.last
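
This is the aggregation named in the commit title: _fetch now returns a single Patch instead of the old (chats, continuation, first, last) tuple, so callers read named fields and derive adjusted copies with NamedTuple._replace. A minimal sketch, using the import path from this commit's tests:

    from pytchat.tool.download.patch import Patch

    # Before this commit, _fetch returned a bare 4-tuple:
    #     chats, cont, first, last = await self.fetch(continuation, session)
    # Now the same data travels as one named object:
    patch = Patch(chats=[], continuation='token', first=32500, last=34000)
    assert patch.last - patch.first == 1500

    # _replace builds an adjusted copy, as fill() does above when it
    # cuts data past block.end and terminates the continuation:
    cut = patch._replace(continuation=None, last=33000)
    assert cut.continuation is None and patch.continuation == 'token'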


@@ -1,19 +1,19 @@
import aiohttp
import asyncio
import json
-from pytchat.tool import parser
+from pytchat.tool.download import parser
import sys
import time
from aioresponses import aioresponses
from concurrent.futures import CancelledError
-from pytchat.tool import asyncdl
+from pytchat.tool.download import asyncdl
def _open_file(path):
with open(path,mode ='r',encoding = 'utf-8') as f:
return f.read()
-def test_asyncdl_split(mocker):
+def test_asyncdl_split():
ret = asyncdl._split(0,1000,1)
assert ret == [0]


@@ -3,60 +3,73 @@ import asyncio
import json
import os, sys
import time
from aioresponses import aioresponses
-from pytchat.tool import duplcheck
-from pytchat.tool import parser
-from pytchat.tool.block import Block
-from pytchat.tool.duplcheck import _dump
+from pytchat.tool.download import duplcheck
+from pytchat.tool.download import parser
+from pytchat.tool.download.block import Block
+from pytchat.tool.download.duplcheck import _dump
def _open_file(path):
with open(path,mode ='r',encoding = 'utf-8') as f:
return f.read()
-def load_chatdata(filename):
+def test_overlap():
+    """
+    test overlap data
+    operation : [0] [2] [3] [4] -> last : align to end
+                [1] , [5]       -> no change
+    """
+    def load_chatdata(filename):
+        return parser.parse(
+            json.loads(_open_file("tests/testdata/dl_duplcheck/overlap/"+filename))
+        )[1]
+
+    blocks = (
+        Block(first = 0,     last = 12771, end = 9890,  chat_data = load_chatdata("dp0-0.json")),
+        Block(first = 9890,  last = 15800, end = 20244, chat_data = load_chatdata("dp0-1.json")),
+        Block(first = 20244, last = 45146, end = 32476, chat_data = load_chatdata("dp0-2.json")),
+        Block(first = 32476, last = 50520, end = 41380, chat_data = load_chatdata("dp0-3.json")),
+        Block(first = 41380, last = 62875, end = 52568, chat_data = load_chatdata("dp0-4.json")),
+        Block(first = 52568, last = 62875, end = 54000, chat_data = load_chatdata("dp0-5.json"), is_last = True)
+    )
+    result = duplcheck.remove_overlap(blocks)
+    # dp0-0.json has an item whose offset time is 9890 (equal to
+    # block[0].end == block[1].first), but it must be aligned to the
+    # closest smaller value, 9779.
+    assert result[0].last == 9779
+    assert result[1].last == 15800
+    assert result[2].last == 32196
+    assert result[3].last == 41116
+    assert result[4].last == 52384
+    # the last block is always kept in the result.
+    assert result[5].last == 62875
+
+def test_duplicate_head():
+    def load_chatdata(filename):
        return parser.parse(
            json.loads(_open_file("tests/testdata/dl_duplcheck/head/"+filename))
        )[1]

-def test_overwrap(mocker):
-    """
-    test overwrap data
-    operation : [0] , [1] -> discard [1]
-                [0] , [2] , [3] -> discard [2]
-                [3] , [4] , [5] -> discard [4]
-    result    : [0] , [3] , [5]
-    """
-    blocks = (
-        Block(first = 0,     last = 38771, chat_data = load_chatdata("dp0-0.json")),
-        Block(first = 9890,  last = 38771, chat_data = load_chatdata("dp0-1.json")),
-        Block(first = 20244, last = 45146, chat_data = load_chatdata("dp0-2.json")),
-        Block(first = 32476, last = 60520, chat_data = load_chatdata("dp0-3.json")),
-        Block(first = 41380, last = 62875, chat_data = load_chatdata("dp0-4.json")),
-        Block(first = 52568, last = 62875, chat_data = load_chatdata("dp0-5.json"))
-    )
-    result = duplcheck.overwrap(blocks)
-    assert len(result) == 3
-    assert result[0].first == blocks[0].first
-    assert result[0].last == blocks[0].last
-    assert result[1].first == blocks[3].first
-    assert result[1].last == blocks[3].last
-    assert result[2].first == blocks[5].first
-    assert result[2].last == blocks[5].last

-def test_duplicate_head(mocker):
    """
    test duplicate head data
    operation : [0] , [1] -> discard [0]
                [1] , [2] -> discard [1]
                [2] , [3] -> append [2]
                [3] , [4] -> discard [3]
                [4] , [5] -> append [4]
                append [5]
    result    : [2] , [4] , [5]
    """
    # chat data offsets are ignored.
blocks = (
Block(first = 0, last = 2500, chat_data = load_chatdata("dp0-0.json")),
Block(first = 0, last =38771, chat_data = load_chatdata("dp0-1.json")),
@@ -66,7 +79,7 @@ def test_duplicate_head(mocker):
Block(first = 52568, last =62875, chat_data = load_chatdata("dp0-5.json"))
)
_dump(blocks)
-    result = duplcheck.duplicate_head(blocks)
+    result = duplcheck.remove_duplicate_head(blocks)
assert len(result) == 3
assert result[0].first == blocks[2].first
@@ -76,19 +89,23 @@ def test_duplicate_head(mocker):
assert result[2].first == blocks[5].first
assert result[2].last == blocks[5].last
-def test_duplicate_tail(mocker):
+def test_duplicate_tail():
    """
    test duplicate tail data
    operation : append [0]
                [0] , [1] -> discard [1]
                [1] , [2] -> append [2]
                [2] , [3] -> discard [3]
                [3] , [4] -> append [4]
                [4] , [5] -> discard [5]
    result    : [0] , [2] , [4]
    """
+    def load_chatdata(filename):
+        return parser.parse(
+            json.loads(_open_file("tests/testdata/dl_duplcheck/head/"+filename))
+        )[1]
+
+    # chat data offsets are ignored.
blocks = (
Block(first = 0,last = 2500, chat_data=load_chatdata("dp0-0.json")),
Block(first = 1500,last = 2500, chat_data=load_chatdata("dp0-1.json")),
@@ -98,7 +115,7 @@ def test_duplicate_tail(mocker):
Block(first = 52568,last = 62875, chat_data=load_chatdata("dp0-5.json"))
)
-    result = duplcheck.duplicate_tail(blocks)
+    result = duplcheck.remove_duplicate_tail(blocks)
_dump(result)
assert len(result) == 3
assert result[0].first == blocks[0].first

tests/test_patch.py (new file, 232 lines)

@@ -0,0 +1,232 @@
import aiohttp
import asyncio
import json
import os, sys
import time
from aioresponses import aioresponses
from pytchat.tool.download import duplcheck
from pytchat.tool.download import parser
from pytchat.tool.download.block import Block
from pytchat.tool.download.patch import Patch, fill, split, set_patch
from pytchat.tool.download.duplcheck import _dump
def _open_file(path):
with open(path,mode ='r',encoding = 'utf-8') as f:
return f.read()
def load_chatdata(filename):
return parser.parse(
json.loads(_open_file("tests/testdata/dl_patch/"+filename))
)[1]
def test_split_0():
"""
Normal case
@parent_block (# = already downloaded)
first last end
|########----------------------------------------|
@child_block
first = last = 0 end=parent_end
---------------------------------------------------|
@fetched patch
|-- patch --|
|
|
V
@parent_block
first last end (after split)
|########------------|
@child_block
first last end
|###########---------------|
@fetched patch
|-- patch --|
"""
parent = Block(first=0, last=4000, end=60000, continuation='parent', during_split=True)
child = Block(first=0, last=0, end=60000, continuation='mean', during_split=True)
patch = Patch(chats=load_chatdata('pt0-5.json'),
first=32500, last=34000, continuation='patch')
split(parent,child,patch)
assert child.continuation == 'patch'
assert parent.last < child.first
assert parent.end == child.first
assert child.first < child.last
assert child.last < child.end
assert parent.during_split == False
assert child.during_split == False
def test_split_1():
"""patch.first <= parent_block.last
While awaiting at run()->asyncdl._fetch()
downloading parent_block proceeds,
and parent.block.last exceeds patch.first.
In this case, fetched patch is all discarded,
and dlworker searches other processing block again.
~~~~~~ before ~~~~~~
patch.first
first | last end
|####################|#####|---------------------|
^
@child_block
first = last = 0 end=parent_end
---------------------------------------------------|
@fetched patch
|-- patch --|
|
|
V
~~~~~~ after ~~~~~~
@parent_block
first last end
|###########################|--------------------|
@child_block
    .............. -> discard all data
"""
parent = Block(first=0, last=33000, end=60000, continuation='parent', during_split=True)
child = Block(first=0, last=0, end=60000, continuation='mean', during_split=True)
patch = Patch(chats=load_chatdata('pt0-5.json'),
first=32500, last=34000, continuation='patch')
split(parent,child,patch)
assert parent.last == 33000 #no change
assert parent.end == 60000 #no change
assert child.continuation is None
assert parent.during_split == False
assert child.during_split == True #exclude during_split sequence
def test_split_2():
"""child_block.end < patch.last:
Case the last offset of patch exceeds child_block.end.
In this case, remove overlapped data of patch.
~~~~~~ before ~~~~~~
@parent_block (# = already downloaded)
first last end (before split)
|########------------------------------|
@child_block
first = last = 0 end=parent_end
-----------------------------------------|
continuation:succeed from patch
@fetched patch
|-------- patch --------|
|
|
V
~~~~~~ after ~~~~~~
@parent_block
first last end (after split)
|########------------|
@child_block old patch.end
first last=end |
|#################|...... cut extra data.
^
continuation : None (download complete)
@fetched patch
|-------- patch --------|
"""
parent = Block(first=0, last=4000, end=33500, continuation='parent', during_split=True)
child = Block(first=0, last=0, end=33500, continuation='mean', during_split=True)
patch = Patch(chats=load_chatdata('pt0-5.json'),
first=32500, last=34000, continuation='patch')
split(parent,child,patch)
assert child.continuation is None
assert parent.last < child.first
assert parent.end == child.first
assert child.first < child.last
assert child.last < child.end
assert child.continuation is None
assert parent.during_split == False
assert child.during_split == False
def test_split_none():
"""patch.last <= parent_block.last
While awaiting at run()->asyncdl._fetch()
downloading parent_block proceeds,
and parent.block.last exceeds patch.first.
In this case, fetched patch is all discarded,
and dlworker searches other processing block again.
~~~~~~ before ~~~~~~
patch.first
first | last end
|####################|###################|-------|
^
@child_block
first = last = 0 end=parent_end
---------------------------------------------------|
@fetched patch
|-- patch --|
    patch.last < parent_block.last.
|
|
V
~~~~~~ after ~~~~~~
@parent_block
first last end (before split)
|########################################|-------|
.
@child_block
............ -> discard all data.
"""
parent = Block(first=0, last=40000, end=60000, continuation='parent', during_split=True)
child = Block(first=0, last=0, end=60000, continuation='mean', during_split=True)
patch = Patch(chats=load_chatdata('pt0-5.json'),
first=32500, last=34000, continuation='patch')
split(parent,child,patch)
assert parent.last == 40000 #no change
assert parent.end == 60000 #no change
assert child.continuation is None
assert parent.during_split == False
assert child.during_split == True #exclude during_split sequence

(Several more large file diffs suppressed.)

New vendored test data files (diffs suppressed because they are too large):

tests/testdata/dl_patch/pt0-0.json (new file, 3078 lines)
tests/testdata/dl_patch/pt0-1.json (new file, 3078 lines)
tests/testdata/dl_patch/pt0-3.json (new file, 3078 lines)
tests/testdata/dl_patch/pt0-4.json (new file, 3078 lines)
tests/testdata/dl_patch/pt0-5.json (new file, 3078 lines)