Merge branch 'feature/downloader' into develop

taizan-hokuto
2020-02-16 21:38:19 +09:00
34 changed files with 55145 additions and 2 deletions

MANIFEST.in (modified)

@@ -1,5 +1,5 @@
include requirements.txt
include requirements_test.txt
-include README.MD
+include README.md
global-exclude tests/*
global-exclude pytchat/testrun*.py

error.json (new file, 1 line)

File diff suppressed because one or more lines are too long

pytchat/config/__init__.py (modified)

@@ -4,7 +4,7 @@ from . import mylogger
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}

-def logger(module_name: str, loglevel = None):
+def logger(module_name: str, loglevel = logging.DEBUG):
    module_logger = mylogger.get_logger(module_name, loglevel = loglevel)
    return module_logger

pytchat/exceptions.py (modified)

@@ -41,3 +41,6 @@ class IllegalFunctionCall(Exception):
    Exception raised when get() is called illegally.
    '''
    pass

+class InvalidVideoIdException(Exception):
+    pass

pytchat/tool/__init__.py (new file, empty)
pytchat/tool/download/__init__.py (new file, empty)

pytchat/tool/download/asyncdl.py (new file)

@@ -0,0 +1,132 @@
import aiohttp
import asyncio
import json
from . import parser
from . block import Block
from . dlworker import DownloadWorker
from . patch import Patch
from ... import config
from ... paramgen import arcparam
from concurrent.futures import CancelledError
from urllib.parse import quote

headers = config.headers
REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
             "get_live_chat_replay?continuation="

def _split(start, end, count, min_interval_sec = 120):
    """
    Split the range from `start` to `end` into `count` pieces,
    and return the beginning offset of each piece.
    `count` is adjusted so that the length of each piece
    is no smaller than `min_interval_sec`.

    Returns:
    --------
    List of the offsets of each block's first chat data.
    """
    if not (isinstance(start, int) or isinstance(start, float)) or \
       not (isinstance(end, int) or isinstance(end, float)):
        raise ValueError("start/end must be int or float")
    if not isinstance(count, int):
        raise ValueError("count must be int")
    if start > end:
        raise ValueError("end must be equal to or greater than start.")
    if count < 1:
        raise ValueError("count must be equal to or greater than 1.")
    if (end - start) / count < min_interval_sec:
        count = int((end - start) / min_interval_sec)
        if count == 0:
            count = 1
    interval = (end - start) / count
    if count == 1:
        return [start]
    return sorted(list(set(
        [int(start + interval * j) for j in range(count)])))

def ready_blocks(video_id, duration, div, callback):
    if div <= 0:
        raise ValueError

    async def _get_blocks(video_id, duration, div, callback):
        async with aiohttp.ClientSession() as session:
            tasks = [_create_block(session, video_id, seektime, callback)
                     for seektime in _split(-1, duration, div)]
            return await asyncio.gather(*tasks)

    async def _create_block(session, video_id, seektime, callback):
        continuation = arcparam.getparam(video_id, seektime = seektime)
        url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
        async with session.get(url, headers = headers) as resp:
            text = await resp.text()
        next_continuation, actions = parser.parse(json.loads(text))
        if actions:
            first = parser.get_offset(actions[0])
            last = parser.get_offset(actions[-1])
            if callback:
                callback(actions, last - first)
            return Block(
                continuation = next_continuation,
                chat_data = actions,
                first = first,
                last = last
            )

    """
    fetch initial blocks.
    """
    loop = asyncio.get_event_loop()
    blocks = loop.run_until_complete(
        _get_blocks(video_id, duration, div, callback))
    return blocks

def download_patch(callback, blocks, video_id):
    async def _allocate_workers():
        workers = [
            DownloadWorker(
                fetch = _fetch, block = block,
                blocks = blocks, video_id = video_id
            )
            for block in blocks
        ]
        async with aiohttp.ClientSession() as session:
            tasks = [worker.run(session) for worker in workers]
            return await asyncio.gather(*tasks)

    async def _fetch(continuation, session) -> Patch:
        url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
        async with session.get(url, headers = config.headers) as resp:
            chat_json = await resp.text()
        continuation, actions = parser.parse(json.loads(chat_json))
        if actions:
            last = parser.get_offset(actions[-1])
            first = parser.get_offset(actions[0])
            if callback:
                callback(actions, last - first)
            return Patch(actions, continuation, first, last)
        return Patch()

    """
    allocate workers and assign blocks.
    """
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(_allocate_workers())
    except CancelledError:
        pass

async def _shutdown():
    print("\nshutdown...")
    tasks = [t for t in asyncio.all_tasks()
             if t is not asyncio.current_task()]
    for task in tasks:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

def cancel():
    loop = asyncio.get_event_loop()
    loop.create_task(_shutdown())
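
A quick illustration of the splitting rule (the expected values match test_asyncdl_split later in this diff): with the default 120-second minimum interval, a 500-unit range cannot hold five pieces, so count is reduced to int(500 / 120) = 4.

>>> from pytchat.tool.download import asyncdl
>>> asyncdl._split(0, 1000, 5)   # 1000 / 5 = 200 >= 120, keep 5 pieces
[0, 200, 400, 600, 800]
>>> asyncdl._split(0, 500, 5)    # 500 / 5 = 100 < 120, count becomes 4
[0, 125, 250, 375]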

pytchat/tool/download/block.py (new file)

@@ -0,0 +1,57 @@
from . import parser

class Block:
    """Block object represents something like a box
    for joining chunks of chat data.

    Parameters:
    ----------
    first : int :
        videoOffsetTimeMs of the first chat_data
        (chat_data[0])

    last : int :
        videoOffsetTimeMs of the last chat_data
        (chat_data[-1]).
        This value increases as fetching chat data progresses.

    end : int :
        target videoOffsetTimeMs of the last chat data to download;
        equals the first videoOffsetTimeMs of the next block.
        When the download worker reaches this offset, it stops downloading.

    continuation : str :
        continuation param of the last chat data.

    chat_data : list

    done : bool :
        whether this block has been downloaded.

    remaining : int :
        amount of data remaining to download;
        equals end - last.

    is_last : bool :
        whether this block is the last one in the block list.

    during_split : bool :
        whether this block is currently being split.
        While True, this block is excluded from the split
        target search, so it is not split twice.
    """

    __slots__ = ['first', 'last', 'end', 'continuation', 'chat_data',
                 'remaining', 'done', 'is_last', 'during_split']

    def __init__(self, first = 0, last = 0, end = 0,
                 continuation = '', chat_data = None, is_last = False,
                 during_split = False):
        self.first = first
        self.last = last
        self.end = end
        self.continuation = continuation
        # use None as the default to avoid sharing one mutable list
        # between instances
        self.chat_data = chat_data if chat_data is not None else []
        self.done = False
        self.remaining = self.end - self.last
        self.is_last = is_last
        self.during_split = during_split
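
A minimal sketch of the bookkeeping, with made-up offsets in milliseconds. Note that remaining is computed once at construction; set_patch() in patch.py keeps it up to date afterwards.

>>> from pytchat.tool.download.block import Block
>>> b = Block(first=0, last=9779, end=20244, continuation='token')
>>> b.remaining        # end - last: chat still to fetch for this block
10465
>>> b.done
False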

pytchat/tool/download/dlworker.py (new file)

@@ -0,0 +1,87 @@
from typing import Tuple
from . import parser
from . block import Block
from . patch import Patch, fill, split
from ... paramgen import arcparam

class DownloadWorker:
    """
    DownloadWorker associates a download session with a block.
    When a worker finishes downloading its block, the largest
    unfinished block is split, and the new piece is assigned
    to the now-free worker.

    Parameters
    ----------
    fetch : func :
        download function of asyncdl

    block : Block :
        Block object that holds chat_data

    blocks : list :
        list of Block objects

    video_id : str :

    parent_block : Block :
        the block from which the current block was split.
    """

    __slots__ = ['block', 'fetch', 'blocks', 'video_id', 'parent_block']

    def __init__(self, fetch, block, blocks, video_id):
        self.block = block
        self.fetch = fetch
        self.blocks = blocks
        self.video_id = video_id
        self.parent_block = None

    async def run(self, session):
        while self.block.continuation:
            patch = await self.fetch(
                self.block.continuation, session)
            if patch.continuation is None:
                """TODO : make the worker assigned to the last block
                able to take on more work where possible.
                """
                break
            if self.parent_block:
                split(self.parent_block, self.block, patch)
                self.parent_block = None
            else:
                fill(self.block, patch)
            if self.block.continuation is None:
                """finished downloading this block"""
                self.block.done = True
                self.block = _search_new_block(self)

def _search_new_block(worker) -> Block:
    index, undone_block = _get_undone_block(worker.blocks)
    if undone_block is None:
        return Block(continuation = None)
    mean = (undone_block.last + undone_block.end) / 2
    continuation = arcparam.getparam(worker.video_id, seektime = mean / 1000)
    worker.parent_block = undone_block
    worker.parent_block.during_split = True
    new_block = Block(
        end = undone_block.end,
        chat_data = [],
        continuation = continuation,
        during_split = True,
        is_last = worker.parent_block.is_last)
    worker.blocks.insert(index + 1, new_block)
    return new_block

def _get_undone_block(blocks) -> Tuple[int, Block]:
    min_interval_ms = 120000
    max_remaining = 0
    undone_block = None
    index_undone_block = 0
    for index, block in enumerate(blocks):
        if block.done or block.during_split:
            continue
        remaining = block.remaining
        if remaining > max_remaining and remaining > min_interval_ms:
            index_undone_block = index
            undone_block = block
            max_remaining = remaining
    return index_undone_block, undone_block
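
The search above picks the unfinished block with the most chat left to fetch, skipping blocks that are done, currently being split, or have no more than 120000 ms remaining. A small sketch with synthetic offsets:

>>> from pytchat.tool.download.block import Block
>>> from pytchat.tool.download.dlworker import _get_undone_block
>>> blocks = [
...     Block(last=100000, end=190000),   # remaining  90000 ms: under threshold, skipped
...     Block(last=200000, end=500000),   # remaining 300000 ms: largest, selected
...     Block(last=300000, end=450000),   # remaining 150000 ms
... ]
>>> _get_undone_block(blocks)[0]          # index of the selected block
1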

pytchat/tool/download/downloader.py (new file)

@@ -0,0 +1,89 @@
from . import asyncdl
from . import duplcheck
from . import parser
from .. videoinfo import VideoInfo
from ... import config
from ... exceptions import InvalidVideoIdException

logger = config.logger(__name__)
headers = config.headers

class Downloader:
    def __init__(self, video_id, duration, div, callback):
        if not isinstance(div, int) or div < 1:
            raise ValueError('div must be a positive integer.')
        elif div > 10:
            div = 10
        if not isinstance(duration, int) or duration < 1:
            raise ValueError('duration must be a positive integer.')
        self.video_id = video_id
        self.duration = duration
        self.div = div
        self.callback = callback
        self.blocks = []

    def _ready_blocks(self):
        blocks = asyncdl.ready_blocks(
            self.video_id, self.duration, self.div, self.callback)
        self.blocks = [block for block in blocks if block]
        return self

    def _remove_duplicate_head(self):
        self.blocks = duplcheck.remove_duplicate_head(self.blocks)
        return self

    def _set_block_end(self):
        for i in range(len(self.blocks) - 1):
            self.blocks[i].end = self.blocks[i + 1].first
        self.blocks[-1].end = self.duration * 1000
        self.blocks[-1].is_last = True
        return self

    def _remove_overlap(self):
        self.blocks = duplcheck.remove_overlap(self.blocks)
        return self

    def _download_blocks(self):
        asyncdl.download_patch(self.callback, self.blocks, self.video_id)
        return self

    def _remove_duplicate_tail(self):
        self.blocks = duplcheck.remove_duplicate_tail(self.blocks)
        return self

    def _combine(self):
        ret = []
        for block in self.blocks:
            ret.extend(block.chat_data)
        return ret

    def download(self):
        return (
            self._ready_blocks()
                ._remove_duplicate_head()
                ._set_block_end()
                ._remove_overlap()
                ._download_blocks()
                ._remove_duplicate_tail()
                ._combine()
        )

def download(video_id, div = 1, callback = None, processor = None):
    duration = 0
    try:
        duration = VideoInfo(video_id).get("duration")
    except InvalidVideoIdException:
        raise
    if duration == 0:
        print("video is live.")
        return []
    data = Downloader(video_id, duration, div, callback).download()
    if processor is None:
        return data
    return processor.process(
        [{'video_id': None, 'timeout': 1, 'chatdata':
            [action["replayChatItemAction"]["actions"][0] for action in data]}]
    )

def cancel():
    asyncdl.cancel()
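
A hedged usage sketch of the module-level download() entry point; it assumes this file is importable as pytchat.tool.download.downloader, and "_video_id_" stands in for a real archived-stream id:

from pytchat.tool.download import downloader

def progress(actions, fetched_ms):
    # receives each fetched chunk and the chat time span it covers
    print(f"+{len(actions)} chat items (~{fetched_ms} ms)")

# returns a list of raw replayChatItemAction objects; [] if the video is live
data = downloader.download("_video_id_", div=10, callback=progress)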

pytchat/tool/download/duplcheck.py (new file)

@@ -0,0 +1,153 @@
from . import parser

def check_duplicate(chatdata):
    max_range = len(chatdata) - 1
    tbl_offset = [None] * max_range
    tbl_id = [None] * max_range
    tbl_type = [None] * max_range

    def create_table(chatdata, max_range):
        for i in range(max_range):
            tbl_offset[i] = parser.get_offset(chatdata[i])
            tbl_id[i] = parser.get_id(chatdata[i])
            tbl_type[i] = parser.get_type(chatdata[i])

    def is_duplicate(i, j):
        return (
            tbl_offset[i] == tbl_offset[j]
            and tbl_id[i] == tbl_id[j]
            and tbl_type[i] == tbl_type[j]
        )

    print("creating table...")
    create_table(chatdata, max_range)
    print("searching duplicate data...")
    return [{"i": {
                "index": i, "id": parser.get_id(chatdata[i]),
                "offsetTime": parser.get_offset(chatdata[i]),
                "type": parser.get_type(chatdata[i])
            },
            "j": {
                "index": j, "id": parser.get_id(chatdata[j]),
                "offsetTime": parser.get_offset(chatdata[j]),
                "type": parser.get_type(chatdata[j])
            }}
            for i in range(max_range) for j in range(i + 1, max_range)
            if is_duplicate(i, j)]

def check_duplicate_offset(chatdata):
    max_range = len(chatdata)
    tbl_offset = [None] * max_range
    tbl_id = [None] * max_range
    tbl_type = [None] * max_range

    def create_table(chatdata, max_range):
        for i in range(max_range):
            tbl_offset[i] = parser.get_offset(chatdata[i])
            tbl_id[i] = parser.get_id(chatdata[i])
            tbl_type[i] = parser.get_type(chatdata[i])

    def is_duplicate(i, j):
        return (
            tbl_offset[i] == tbl_offset[j]
            and tbl_id[i] == tbl_id[j]
        )

    print("creating table...")
    create_table(chatdata, max_range)
    print("searching duplicate data...")
    return [{
                "index": i, "id": tbl_id[i],
                "offsetTime": tbl_offset[i],
                "type": tbl_type[i]
            }
            for i in range(max_range - 1)
            if is_duplicate(i, i + 1)]

def remove_duplicate_head(blocks):
    if len(blocks) == 1:
        return blocks

    def is_duplicate_head(index):
        if len(blocks[index].chat_data) == 0:
            return True
        elif len(blocks[index + 1].chat_data) == 0:
            return False
        id_0 = parser.get_id(blocks[index].chat_data[0])
        id_1 = parser.get_id(blocks[index + 1].chat_data[0])
        type_0 = parser.get_type(blocks[index].chat_data[0])
        type_1 = parser.get_type(blocks[index + 1].chat_data[0])
        return (
            blocks[index].first == blocks[index + 1].first
            and id_0 == id_1
            and type_0 == type_1
        )

    ret = [blocks[i] for i in range(len(blocks) - 1)
           if (len(blocks[i].chat_data) > 0 and
               not is_duplicate_head(i))]
    ret.append(blocks[-1])
    return ret

def remove_duplicate_tail(blocks):
    if len(blocks) == 1:
        return blocks

    def is_duplicate_tail(index):
        if len(blocks[index].chat_data) == 0:
            return True
        elif len(blocks[index - 1].chat_data) == 0:
            return False
        id_0 = parser.get_id(blocks[index - 1].chat_data[-1])
        id_1 = parser.get_id(blocks[index].chat_data[-1])
        type_0 = parser.get_type(blocks[index - 1].chat_data[-1])
        type_1 = parser.get_type(blocks[index].chat_data[-1])
        return (
            blocks[index - 1].last == blocks[index].last
            and id_0 == id_1
            and type_0 == type_1
        )

    ret = [blocks[i] for i in range(len(blocks))
           if i == 0 or not is_duplicate_tail(i)]
    return ret

def remove_overlap(blocks):
    """
    Fix overlapping blocks after ready_blocks().
    Align the last offset of each block to the first offset
    of the next block (which equals the `end` offset of the block).
    """
    if len(blocks) == 1:
        return blocks
    for block in blocks:
        if block.is_last:
            break
        if len(block.chat_data) == 0:
            continue
        block_end = block.end
        if block.last >= block_end:
            for line in reversed(block.chat_data):
                if parser.get_offset(line) < block_end:
                    break
                block.chat_data.pop()
            block.last = parser.get_offset(line)
            block.remaining = 0
            block.done = True
            block.continuation = None
    return blocks

def _dump(blocks):
    print("----------      first       last        end ---")
    for i, block in enumerate(blocks):
        print(f"block[{i:3}] {block.first:>10} {block.last:>10} {block.end:>10}")

pytchat/tool/download/parser.py (new file)

@@ -0,0 +1,54 @@
import json
from ... import config
from ... exceptions import (
    ResponseContextError,
    NoContentsException,
    NoContinuationsException)

logger = config.logger(__name__)

def parse(jsn):
    """
    Parse replay chat data.

    Parameter:
    ----------
    jsn : dict
        JSON of replay chat data.

    Returns:
    --------
    continuation : str
    actions : list
    """
    if jsn is None:
        raise ValueError("parameter JSON is None")
    if jsn['response']['responseContext'].get('errors'):
        raise ResponseContextError(
            'video_id is invalid or private/deleted.')
    contents = jsn['response'].get('continuationContents')
    if contents is None:
        raise NoContentsException('No chat data.')
    cont = contents['liveChatContinuation']['continuations'][0]
    if cont is None:
        raise NoContinuationsException('No Continuation')
    metadata = cont.get('liveChatReplayContinuationData')
    if metadata:
        continuation = metadata.get("continuation")
        actions = contents['liveChatContinuation'].get('actions')
        return continuation, actions
    return None, []

def get_offset(item):
    return int(item['replayChatItemAction']["videoOffsetTimeMsec"])

def get_id(item):
    return list((list(item['replayChatItemAction']["actions"][0].values()
        )[0])['item'].values())[0].get('id')

def get_type(item):
    return list((list(item['replayChatItemAction']["actions"][0].values()
        )[0])['item'].keys())[0]
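
For reference, a synthetic action showing the nesting these helpers walk; the field names follow the live_chat_replay format, but the values here are made up:

>>> from pytchat.tool.download.parser import get_offset, get_id, get_type
>>> item = {'replayChatItemAction': {
...     'videoOffsetTimeMsec': '9779',
...     'actions': [{'addChatItemAction': {
...         'item': {'liveChatTextMessageRenderer': {'id': 'ABC'}}}}]}}
>>> get_offset(item)
9779
>>> get_type(item)
'liveChatTextMessageRenderer'
>>> get_id(item)
'ABC'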

pytchat/tool/download/patch.py (new file)

@@ -0,0 +1,54 @@
from . import parser
from . block import Block
from typing import NamedTuple

class Patch(NamedTuple):
    """
    Patch represents a chunk of chat data
    fetched by asyncdl.download_patch._fetch().
    """
    chats : list = []
    continuation : str = None
    first : int = None
    last : int = None

def fill(block: Block, patch: Patch):
    block_end = block.end
    if patch.last < block_end or block.is_last:
        set_patch(block, patch)
        return
    for line in reversed(patch.chats):
        line_offset = parser.get_offset(line)
        if line_offset < block_end:
            break
        patch.chats.pop()
    set_patch(block, patch._replace(
        continuation = None,
        last = line_offset
    ))
    block.remaining = 0
    block.done = True

def split(parent_block: Block, child_block: Block, patch: Patch):
    parent_block.during_split = False
    """the patch overlaps with parent_block"""
    if patch.first <= parent_block.last:
        child_block.continuation = None
        '''Leave child_block.during_split == True
        so that it stays excluded from the split target search.'''
        return
    child_block.during_split = False
    child_block.first = patch.first
    parent_block.end = patch.first
    fill(child_block, patch)

def set_patch(block: Block, patch: Patch):
    block.continuation = patch.continuation
    block.chat_data.extend(patch.chats)
    block.last = patch.last
    block.remaining = block.end - block.last
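
A sketch of how fill() trims a patch that runs past block.end, reusing the synthetic item shape from the parser example above (offsets in ms):

>>> from pytchat.tool.download.block import Block
>>> from pytchat.tool.download.patch import Patch, fill
>>> def item(ms):
...     return {'replayChatItemAction': {
...         'videoOffsetTimeMsec': str(ms),
...         'actions': [{'addChatItemAction': {
...             'item': {'liveChatTextMessageRenderer': {'id': str(ms)}}}}]}}
>>> block = Block(last=4000, end=30000, continuation='c1')
>>> patch = Patch(chats=[item(t) for t in (25000, 29000, 31000)],
...               continuation='c2', first=25000, last=31000)
>>> fill(block, patch)          # 31000 >= block.end, so that item is cut
>>> block.last, block.done, block.continuation
(29000, True, None)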

pytchat/tool/videoinfo.py (new file, 42 lines)

@@ -0,0 +1,42 @@
import json
import re
import requests
from .. import config
from .. import util
from ..exceptions import InvalidVideoIdException

headers = config.headers
pattern = re.compile(r"yt\.setConfig\({'PLAYER_CONFIG': ({.*})}\);")

class VideoInfo:
    def __init__(self, video_id):
        self.video_id = video_id
        self.info = self._get_info(video_id)

    def _get_info(self, video_id):
        url = f"https://www.youtube.com/embed/{video_id}"
        resp = requests.get(url, headers = headers)
        resp.raise_for_status()
        return self._parse(resp.text)

    def _parse(self, html):
        result = re.search(pattern, html)
        res = json.loads(result.group(1))
        response = res["args"].get("embedded_player_response")
        if response is None:
            raise InvalidVideoIdException("Invalid video id.")
        renderer = (json.loads(response))["embedPreview"]["thumbnailPreviewRenderer"]
        return {
            "duration": int(renderer["videoDurationSeconds"])
                if renderer.get("videoDurationSeconds") else 0,
            "title": [''.join(run["text"]) for run in renderer["title"]["runs"]][0]
                if renderer.get("title") else None,
            "channelId": renderer["videoDetails"]["embeddedPlayerOverlayVideoDetailsRenderer"]
                ["channelThumbnailEndpoint"]["channelThumbnailEndpoint"]
                ["urlEndpoint"]["urlEndpoint"]["url"][9:]
                if renderer.get("videoDetails") else None,
            "authorProfileImage": renderer["videoDetails"]["embeddedPlayerOverlayVideoDetailsRenderer"]
                ["channelThumbnail"]["thumbnails"][0]["url"]
                if renderer.get("videoDetails") else None,
            "thumbnail": renderer["defaultThumbnail"]["thumbnails"][2]["url"]
                if renderer.get("defaultThumbnail") else None,
            "channelName": renderer["videoDetails"]["embeddedPlayerOverlayVideoDetailsRenderer"]
                ["expandedRenderer"]["embeddedPlayerOverlayVideoDetailsExpandedRenderer"]
                ["title"]["runs"][0]["text"]
                if renderer.get("videoDetails") else None,
            "movingThumbnail": renderer["movingThumbnail"]["thumbnails"][0]["url"]
                if renderer.get("movingThumbnail") else None
        }

    def get(self, item):
        return self.info.get(item)
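
A hedged usage sketch with a placeholder video id; as Downloader above relies on, get("duration") comes back as 0 while a stream is still live:

from pytchat.tool.videoinfo import VideoInfo

info = VideoInfo("_video_id_")   # raises InvalidVideoIdException on bad ids
print(info.get("title"), info.get("duration"))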

tests/test_dl_asyncdl.py (new file, 77 lines)

@@ -0,0 +1,77 @@
import aiohttp
import asyncio
import json
from pytchat.tool.download import parser
import sys
import time
from aioresponses import aioresponses
from concurrent.futures import CancelledError
from pytchat.tool.download import asyncdl

def _open_file(path):
    with open(path, mode = 'r', encoding = 'utf-8') as f:
        return f.read()

def test_asyncdl_split():
    ret = asyncdl._split(0, 1000, 1)
    assert ret == [0]
    ret = asyncdl._split(1000, 1000, 10)
    assert ret == [1000]
    ret = asyncdl._split(0, 1000, 5)
    assert ret == [0, 200, 400, 600, 800]
    ret = asyncdl._split(10.5, 700.3, 5)
    assert ret == [10, 148, 286, 424, 562]
    ret = asyncdl._split(0, 500, 5)
    assert ret == [0, 125, 250, 375]
    ret = asyncdl._split(0, 500, 500)
    assert ret == [0, 125, 250, 375]
    ret = asyncdl._split(-1, 1000, 5)
    assert ret == [-1, 199, 399, 599, 799]

    """invalid argument order"""
    try:
        ret = asyncdl._split(500, 0, 5)
        assert False
    except ValueError:
        assert True

    """invalid count"""
    try:
        ret = asyncdl._split(0, 500, -1)
        assert False
    except ValueError:
        assert True
    try:
        ret = asyncdl._split(0, 500, 0)
        assert False
    except ValueError:
        assert True

    """invalid argument type"""
    try:
        ret = asyncdl._split(0, 5000, 5.2)
        assert False
    except ValueError:
        assert True
    try:
        ret = asyncdl._split(0, 5000, "test")
        assert False
    except ValueError:
        assert True
    try:
        ret = asyncdl._split([0, 1], 5000, 5)
        assert False
    except ValueError:
        assert True

tests/test_dl_duplcheck.py (new file, 128 lines)

@@ -0,0 +1,128 @@
import aiohttp
import asyncio
import json
import os, sys
import time
from pytchat.tool.download import duplcheck
from pytchat.tool.download import parser
from pytchat.tool.download.block import Block
from pytchat.tool.download.duplcheck import _dump

def _open_file(path):
    with open(path, mode = 'r', encoding = 'utf-8') as f:
        return f.read()

def test_overlap():
    """
    test overlapping data

    operation : [0] [2] [3] [4] -> last : align to end
                [1] , [5]       -> no change
    """
    def load_chatdata(filename):
        return parser.parse(
            json.loads(_open_file("tests/testdata/dl_duplcheck/overlap/" + filename))
        )[1]

    blocks = (
        Block(first = 0,     last = 12771, end = 9890,  chat_data = load_chatdata("dp0-0.json")),
        Block(first = 9890,  last = 15800, end = 20244, chat_data = load_chatdata("dp0-1.json")),
        Block(first = 20244, last = 45146, end = 32476, chat_data = load_chatdata("dp0-2.json")),
        Block(first = 32476, last = 50520, end = 41380, chat_data = load_chatdata("dp0-3.json")),
        Block(first = 41380, last = 62875, end = 52568, chat_data = load_chatdata("dp0-4.json")),
        Block(first = 52568, last = 62875, end = 54000, chat_data = load_chatdata("dp0-5.json"), is_last = True)
    )
    result = duplcheck.remove_overlap(blocks)
    # dp0-0.json has an item whose offset time is 9890
    # (equal to block[0].end = block[1].first), but the block
    # must be aligned to the closest smaller offset: 9779.
    assert result[0].last == 9779
    assert result[1].last == 15800
    assert result[2].last == 32196
    assert result[3].last == 41116
    assert result[4].last == 52384
    # the last block is always kept unchanged.
    assert result[5].last == 62875

def test_duplicate_head():
    def load_chatdata(filename):
        return parser.parse(
            json.loads(_open_file("tests/testdata/dl_duplcheck/head/" + filename))
        )[1]

    """
    test duplicate head data
    operation : [0] , [1] -> discard [0]
                [1] , [2] -> discard [1]
                [2] , [3] -> append [2]
                [3] , [4] -> discard [3]
                [4] , [5] -> append [4]
                append [5]

    result : [2] , [4] , [5]
    """
    # offsets in the chat data are ignored.
    blocks = (
        Block(first = 0,     last = 2500,  chat_data = load_chatdata("dp0-0.json")),
        Block(first = 0,     last = 38771, chat_data = load_chatdata("dp0-1.json")),
        Block(first = 0,     last = 45146, chat_data = load_chatdata("dp0-2.json")),
        Block(first = 20244, last = 60520, chat_data = load_chatdata("dp0-3.json")),
        Block(first = 20244, last = 62875, chat_data = load_chatdata("dp0-4.json")),
        Block(first = 52568, last = 62875, chat_data = load_chatdata("dp0-5.json"))
    )
    _dump(blocks)
    result = duplcheck.remove_duplicate_head(blocks)

    assert len(result) == 3
    assert result[0].first == blocks[2].first
    assert result[0].last == blocks[2].last
    assert result[1].first == blocks[4].first
    assert result[1].last == blocks[4].last
    assert result[2].first == blocks[5].first
    assert result[2].last == blocks[5].last

def test_duplicate_tail():
    """
    test duplicate tail data
    operation : append [0]
                [0] , [1] -> discard [1]
                [1] , [2] -> append [2]
                [2] , [3] -> discard [3]
                [3] , [4] -> append [4]
                [4] , [5] -> discard [5]

    result : [0] , [2] , [4]
    """
    def load_chatdata(filename):
        return parser.parse(
            json.loads(_open_file("tests/testdata/dl_duplcheck/head/" + filename))
        )[1]

    # offsets in the chat data are ignored.
    blocks = (
        Block(first = 0,     last = 2500,  chat_data = load_chatdata("dp0-0.json")),
        Block(first = 1500,  last = 2500,  chat_data = load_chatdata("dp0-1.json")),
        Block(first = 10000, last = 45146, chat_data = load_chatdata("dp0-2.json")),
        Block(first = 20244, last = 45146, chat_data = load_chatdata("dp0-3.json")),
        Block(first = 20244, last = 62875, chat_data = load_chatdata("dp0-4.json")),
        Block(first = 52568, last = 62875, chat_data = load_chatdata("dp0-5.json"))
    )
    result = duplcheck.remove_duplicate_tail(blocks)
    _dump(result)

    assert len(result) == 3
    assert result[0].first == blocks[0].first
    assert result[0].last == blocks[0].last
    assert result[1].first == blocks[2].first
    assert result[1].last == blocks[2].last
    assert result[2].first == blocks[4].first
    assert result[2].last == blocks[4].last

tests/test_patch.py (new file, 232 lines)

@@ -0,0 +1,232 @@
import aiohttp
import asyncio
import json
import os, sys
import time
from aioresponses import aioresponses
from pytchat.tool.download import duplcheck
from pytchat.tool.download import parser
from pytchat.tool.download.block import Block
from pytchat.tool.download.patch import Patch, fill, split, set_patch
from pytchat.tool.download.duplcheck import _dump

def _open_file(path):
    with open(path, mode = 'r', encoding = 'utf-8') as f:
        return f.read()

def load_chatdata(filename):
    return parser.parse(
        json.loads(_open_file("tests/testdata/dl_patch/" + filename))
    )[1]

def test_split_0():
    """
    Normal case

    @parent_block (# = already downloaded)
    first     last                                    end
      |########----------------------------------------|

    @child_block
              first = last = 0               end = parent_end
      ---------------------------------------------------|

    @fetched patch
                  |-- patch --|
                        |
                        V
    @parent_block
    first     last       end (after split)
      |########------------|

    @child_block
              first        last              end
                  |###########---------------|

    @fetched patch
                  |-- patch --|
    """
    parent = Block(first=0, last=4000, end=60000, continuation='parent', during_split=True)
    child = Block(first=0, last=0, end=60000, continuation='mean', during_split=True)
    patch = Patch(chats=load_chatdata('pt0-5.json'),
                  first=32500, last=34000, continuation='patch')

    split(parent, child, patch)

    assert child.continuation == 'patch'
    assert parent.last < child.first
    assert parent.end == child.first
    assert child.first < child.last
    assert child.last < child.end
    assert parent.during_split == False
    assert child.during_split == False

def test_split_1():
    """patch.first <= parent_block.last

    While awaiting run() -> asyncdl._fetch(),
    downloading of the parent_block proceeds,
    and parent_block.last exceeds patch.first.
    In this case, the fetched patch is discarded entirely,
    and the worker searches for another block to process.

    ~~~~~~ before ~~~~~~
                        patch.first
    first                  |  last                   end
      |####################|#####|---------------------|

    @child_block
              first = last = 0               end = parent_end
      ---------------------------------------------------|

    @fetched patch
                          |-- patch --|
                                |
                                V
    ~~~~~~ after ~~~~~~
    @parent_block
    first                       last                 end
      |###########################|--------------------|

    @child_block
      ..............  -> discard all data
    """
    parent = Block(first=0, last=33000, end=60000, continuation='parent', during_split=True)
    child = Block(first=0, last=0, end=60000, continuation='mean', during_split=True)
    patch = Patch(chats=load_chatdata('pt0-5.json'),
                  first=32500, last=34000, continuation='patch')

    split(parent, child, patch)

    assert parent.last == 33000   # no change
    assert parent.end == 60000    # no change
    assert child.continuation is None
    assert parent.during_split == False
    assert child.during_split == True   # excluded from the split target search

def test_split_2():
    """child_block.end < patch.last :
    The last offset of the patch exceeds child_block.end.
    In this case, remove the overlapping part of the patch.

    ~~~~~~ before ~~~~~~
    @parent_block (# = already downloaded)
    first     last                          end (before split)
      |########------------------------------|

    @child_block
              first = last = 0          end = parent_end
      ---------------------------------------|
              continuation : succeeds from patch

    @fetched patch
                          |-------- patch --------|
                                    |
                                    V
    ~~~~~~ after ~~~~~~
    @parent_block
    first     last       end (after split)
      |########------------|

    @child_block            old patch.end
              first  last = end |
                  |#############| ...... cut the extra data.
                       ^
              continuation : None (download complete)

    @fetched patch
                          |-------- patch --------|
    """
    parent = Block(first=0, last=4000, end=33500, continuation='parent', during_split=True)
    child = Block(first=0, last=0, end=33500, continuation='mean', during_split=True)
    patch = Patch(chats=load_chatdata('pt0-5.json'),
                  first=32500, last=34000, continuation='patch')

    split(parent, child, patch)

    assert child.continuation is None
    assert parent.last < child.first
    assert parent.end == child.first
    assert child.first < child.last
    assert child.last < child.end
    assert parent.during_split == False
    assert child.during_split == False

def test_split_none():
    """patch.last <= parent_block.last

    While awaiting run() -> asyncdl._fetch(),
    downloading of the parent_block proceeds,
    and parent_block.last exceeds even patch.last.
    In this case, the fetched patch is discarded entirely,
    and the worker searches for another block to process.

    ~~~~~~ before ~~~~~~
                        patch.first
    first                  |                 last    end
      |####################|###################|-------|

    @child_block
              first = last = 0               end = parent_end
      ---------------------------------------------------|

    @fetched patch
                          |-- patch --|
              patch.last < parent_block.last
                                |
                                V
    ~~~~~~ after ~~~~~~
    @parent_block
    first                                  last    end
      |########################################|-------|

    @child_block
      ............  -> discard all data.
    """
    parent = Block(first=0, last=40000, end=60000, continuation='parent', during_split=True)
    child = Block(first=0, last=0, end=60000, continuation='mean', during_split=True)
    patch = Patch(chats=load_chatdata('pt0-5.json'),
                  first=32500, last=34000, continuation='patch')

    split(parent, child, patch)

    assert parent.last == 40000   # no change
    assert parent.end == 60000    # no change
    assert child.continuation is None
    assert parent.during_split == False
    assert child.during_split == True   # excluded from the split target search

12 file diffs suppressed because they are too large.

tests/testdata/dl_patch/pt0-0.json (vendored, new file, 3078 lines): diff suppressed because it is too large
tests/testdata/dl_patch/pt0-1.json (vendored, new file, 3078 lines): diff suppressed because it is too large
tests/testdata/dl_patch/pt0-3.json (vendored, new file, 3078 lines): diff suppressed because it is too large
tests/testdata/dl_patch/pt0-4.json (vendored, new file, 3078 lines): diff suppressed because it is too large
tests/testdata/dl_patch/pt0-5.json (vendored, new file, 3078 lines): diff suppressed because it is too large