Merge branch 'feature/downloader' into develop
@@ -1,5 +1,5 @@
 include requirements.txt
 include requirements_test.txt
-include README.MD
+include README.md
 global-exclude tests/*
 global-exclude pytchat/testrun*.py
1    error.json  Normal file
File diff suppressed because one or more lines are too long
@@ -4,7 +4,7 @@ from . import mylogger
 
 headers = {
     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
 
-def logger(module_name: str, loglevel = None):
+def logger(module_name: str, loglevel = logging.DEBUG):
     module_logger = mylogger.get_logger(module_name, loglevel = loglevel)
     return module_logger
@@ -41,3 +41,6 @@ class IllegalFunctionCall(Exception):
     Exception raised when get() is called.
     '''
     pass
+
+class InvalidVideoIdException(Exception):
+    pass
0    pytchat/tool/__init__.py  Normal file
0    pytchat/tool/download/__init__.py  Normal file
132  pytchat/tool/download/asyncdl.py  Normal file
@@ -0,0 +1,132 @@
import aiohttp
import asyncio
import json
from . import parser
from . block import Block
from . dlworker import DownloadWorker
from . patch import Patch
from ... import config
from ... paramgen import arcparam
from concurrent.futures import CancelledError
from urllib.parse import quote

headers = config.headers
REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
             "get_live_chat_replay?continuation="

def _split(start, end, count, min_interval_sec = 120):
    """
    Split the range from `start` to `end` into `count` pieces,
    and return the beginning of each piece.
    `count` is adjusted so that the length of each piece
    is no smaller than `min_interval_sec`.

    Returns:
    --------
    List of the offsets of each block's first chat data.
    """
    if not (isinstance(start, (int, float)) and isinstance(end, (int, float))):
        raise ValueError("start/end must be int or float")
    if not isinstance(count, int):
        raise ValueError("count must be int")
    if start > end:
        raise ValueError("end must be equal to or greater than start.")
    if count < 1:
        raise ValueError("count must be equal to or greater than 1.")
    if (end - start) / count < min_interval_sec:
        count = int((end - start) / min_interval_sec)
        if count == 0:
            count = 1
    interval = (end - start) / count

    if count == 1:
        return [start]
    return sorted(list(set(
        [int(start + interval * j) for j in range(count)])))

def ready_blocks(video_id, duration, div, callback):
    """
    Fetch initial blocks.
    """
    if div <= 0:
        raise ValueError("div must be positive.")

    async def _get_blocks(video_id, duration, div, callback):
        async with aiohttp.ClientSession() as session:
            tasks = [_create_block(session, video_id, seektime, callback)
                     for seektime in _split(-1, duration, div)]
            return await asyncio.gather(*tasks)

    async def _create_block(session, video_id, seektime, callback):
        continuation = arcparam.getparam(video_id, seektime = seektime)
        url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
        async with session.get(url, headers = headers) as resp:
            text = await resp.text()
        next_continuation, actions = parser.parse(json.loads(text))
        if actions:
            first = parser.get_offset(actions[0])
            last = parser.get_offset(actions[-1])
            if callback:
                callback(actions, last - first)
            return Block(
                continuation = next_continuation,
                chat_data = actions,
                first = first,
                last = last
            )
        # returns None when no chat data; the downloader filters these out

    loop = asyncio.get_event_loop()
    blocks = loop.run_until_complete(
        _get_blocks(video_id, duration, div, callback))
    return blocks

def download_patch(callback, blocks, video_id):
    """
    Allocate workers and assign blocks.
    """
    async def _allocate_workers():
        workers = [
            DownloadWorker(
                fetch = _fetch, block = block,
                blocks = blocks, video_id = video_id
            )
            for block in blocks
        ]
        async with aiohttp.ClientSession() as session:
            tasks = [worker.run(session) for worker in workers]
            return await asyncio.gather(*tasks)

    async def _fetch(continuation, session) -> Patch:
        url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
        async with session.get(url, headers = config.headers) as resp:
            chat_json = await resp.text()
        continuation, actions = parser.parse(json.loads(chat_json))
        if actions:
            last = parser.get_offset(actions[-1])
            first = parser.get_offset(actions[0])
            if callback:
                callback(actions, last - first)
            return Patch(actions, continuation, first, last)
        return Patch()

    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(_allocate_workers())
    except CancelledError:
        pass

async def _shutdown():
    print("\nshutdown...")
    tasks = [t for t in asyncio.all_tasks()
             if t is not asyncio.current_task()]
    for task in tasks:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

def cancel():
    loop = asyncio.get_event_loop()
    loop.create_task(_shutdown())
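Note (reviewer sketch, not part of the commit): ready_blocks() and download_patch() are meant to be chained; the Downloader class in downloader.py below does exactly this, with duplicate removal and block-end assignment in between. VIDEO_ID, the duration, and the callback here are placeholders:

    from pytchat.tool.download import asyncdl

    def on_progress(actions, fetched_ms):
        # receives the fetched chat actions and the covered span in ms
        print(f"fetched {len(actions)} actions (+{fetched_ms} ms)")

    blocks = asyncdl.ready_blocks("VIDEO_ID", duration=1800, div=4,
                                  callback=on_progress)
    blocks = [b for b in blocks if b]   # drop seek points that returned no data
    # (Downloader also sets each block's `end` and removes duplicates here.)
    asyncdl.download_patch(on_progress, blocks, "VIDEO_ID")
    chat = [c for b in blocks for c in b.chat_data]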
57   pytchat/tool/download/block.py  Normal file
@@ -0,0 +1,57 @@
from . import parser

class Block:
    """Block object represents something like a box
    that joins chunks of chat data.

    Parameters:
    ----------
    first : int :
        videoOffsetTimeMs of the first chat_data (chat_data[0])

    last : int :
        videoOffsetTimeMs of the last chat_data (chat_data[-1]).
        This value increases as fetching chat data progresses.

    end : int :
        target videoOffsetTimeMs of the last chat data to download;
        equals the first videoOffsetTimeMs of the next block.
        When the download worker reaches this offset, it stops downloading.

    continuation : str :
        continuation param of the last chat data.

    chat_data : list

    done : bool :
        whether this block has been downloaded.

    remaining : int :
        remaining data to download; equals end - last.

    is_last : bool :
        whether this block is the last one in the block list.

    during_split : bool :
        whether this block is in the process of being split.
        While True, this block is excluded from the duplicate split procedure.
    """

    __slots__ = ['first', 'last', 'end', 'continuation', 'chat_data',
                 'remaining', 'done', 'is_last', 'during_split']

    def __init__(self, first = 0, last = 0, end = 0,
                 continuation = '', chat_data = None, is_last = False,
                 during_split = False):
        self.first = first
        self.last = last
        self.end = end
        self.continuation = continuation
        # default to None instead of a mutable [] so instances
        # do not share one list
        self.chat_data = chat_data if chat_data is not None else []
        self.done = False
        self.remaining = self.end - self.last
        self.is_last = is_last
        self.during_split = during_split
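Note: a quick illustration of the bookkeeping above (made-up values):

    b = Block(first=0, last=4000, end=60000, continuation='token')
    assert b.remaining == 56000   # end - last; shrinks as `last` advances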
87   pytchat/tool/download/dlworker.py  Normal file
@@ -0,0 +1,87 @@
from typing import Tuple
from . import parser
from . block import Block
from . patch import Patch, fill, split
from ... paramgen import arcparam

class DownloadWorker:
    """
    DownloadWorker associates a download session with a block.

    When a dlworker finishes downloading its block, the block with
    the most data still being downloaded is split, and the new child
    block is assigned to the free dlworker.

    Parameters
    ----------
    fetch : func :
        download function of asyncdl

    block : Block :
        Block object that includes chat_data

    blocks : list :
        List of Block(s)

    video_id : str :

    parent_block : Block :
        the block from which the current block was split
    """
    __slots__ = ['block', 'fetch', 'blocks', 'video_id', 'parent_block']

    def __init__(self, fetch, block, blocks, video_id):
        self.block = block
        self.fetch = fetch
        self.blocks = blocks
        self.video_id = video_id
        self.parent_block = None

    async def run(self, session):
        while self.block.continuation:
            patch = await self.fetch(
                self.block.continuation, session)
            if patch.continuation is None:
                # TODO: make the dlworker assigned to the last block
                # work more than twice where possible.
                break
            if self.parent_block:
                split(self.parent_block, self.block, patch)
                self.parent_block = None
            else:
                fill(self.block, patch)
            if self.block.continuation is None:
                # finished downloading this block
                self.block.done = True
                self.block = _search_new_block(self)

def _search_new_block(worker) -> Block:
    index, undone_block = _get_undone_block(worker.blocks)
    if undone_block is None:
        return Block(continuation = None)
    mean = (undone_block.last + undone_block.end) / 2
    continuation = arcparam.getparam(worker.video_id, seektime = mean / 1000)
    worker.parent_block = undone_block
    worker.parent_block.during_split = True
    new_block = Block(
        end = undone_block.end,
        chat_data = [],
        continuation = continuation,
        during_split = True,
        is_last = worker.parent_block.is_last)
    worker.blocks.insert(index + 1, new_block)
    return new_block

def _get_undone_block(blocks) -> Tuple[int, Block]:
    """Return the index and the not-yet-done block with the most
    remaining data, or (0, None) if every block is done or splitting."""
    min_interval_ms = 120000
    max_remaining = 0
    undone_block = None
    index_undone_block = 0
    for index, block in enumerate(blocks):
        if block.done or block.during_split:
            continue
        remaining = block.remaining
        if remaining > max_remaining and remaining > min_interval_ms:
            index_undone_block = index
            undone_block = block
            max_remaining = remaining
    return index_undone_block, undone_block
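Note (reviewer sketch): the selection rule in _get_undone_block() only considers blocks with more than 120 s of data remaining; made-up values:

    from pytchat.tool.download.block import Block
    from pytchat.tool.download.dlworker import _get_undone_block

    a = Block(first=0, last=100000, end=200000)   # 100 s remaining: too small
    b = Block(first=0, last=100000, end=400000)   # 300 s remaining: candidate
    index, chosen = _get_undone_block([a, b])
    assert chosen is b and index == 1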
89   pytchat/tool/download/downloader.py  Normal file
@@ -0,0 +1,89 @@
from . import asyncdl
from . import duplcheck
from . import parser
from .. videoinfo import VideoInfo
from ... import config
from ... exceptions import InvalidVideoIdException

logger = config.logger(__name__)
headers = config.headers

class Downloader:
    def __init__(self, video_id, duration, div, callback):
        if not isinstance(div, int) or div < 1:
            raise ValueError('div must be a positive integer.')
        elif div > 10:
            div = 10
        if not isinstance(duration, int) or duration < 1:
            raise ValueError('duration must be a positive integer.')
        self.video_id = video_id
        self.duration = duration
        self.div = div
        self.callback = callback
        self.blocks = []

    def _ready_blocks(self):
        blocks = asyncdl.ready_blocks(
            self.video_id, self.duration, self.div, self.callback)
        self.blocks = [block for block in blocks if block]
        return self

    def _remove_duplicate_head(self):
        self.blocks = duplcheck.remove_duplicate_head(self.blocks)
        return self

    def _set_block_end(self):
        for i in range(len(self.blocks) - 1):
            self.blocks[i].end = self.blocks[i + 1].first
        self.blocks[-1].end = self.duration * 1000
        self.blocks[-1].is_last = True
        return self

    def _remove_overlap(self):
        self.blocks = duplcheck.remove_overlap(self.blocks)
        return self

    def _download_blocks(self):
        asyncdl.download_patch(self.callback, self.blocks, self.video_id)
        return self

    def _remove_duplicate_tail(self):
        self.blocks = duplcheck.remove_duplicate_tail(self.blocks)
        return self

    def _combine(self):
        ret = []
        for block in self.blocks:
            ret.extend(block.chat_data)
        return ret

    def download(self):
        return (
            self._ready_blocks()
                ._remove_duplicate_head()
                ._set_block_end()
                ._remove_overlap()
                ._download_blocks()
                ._remove_duplicate_tail()
                ._combine()
        )

def download(video_id, div = 1, callback = None, processor = None):
    duration = 0
    try:
        duration = VideoInfo(video_id).get("duration")
    except InvalidVideoIdException:
        # propagate an invalid video id to the caller
        raise
    if duration == 0:
        print("video is live.")
        return []
    data = Downloader(video_id, duration, div, callback).download()
    if processor is None:
        return data
    return processor.process(
        [{'video_id': None, 'timeout': 1, 'chatdata':
            [action["replayChatItemAction"]["actions"][0] for action in data]}]
    )

def cancel():
    asyncdl.cancel()
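Note: minimal usage of the module-level entry point (VIDEO_ID is a placeholder; `processor` is any object exposing a process() method, as assumed by the code above):

    from pytchat.tool.download import downloader

    chat = downloader.download("VIDEO_ID", div=4,
                               callback=lambda actions, ms: print(len(actions)))
    print(f"{len(chat)} chat items downloaded")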
153  pytchat/tool/download/duplcheck.py  Normal file
@@ -0,0 +1,153 @@
from . import parser

def check_duplicate(chatdata):
    max_range = len(chatdata) - 1
    tbl_offset = [None] * max_range
    tbl_id = [None] * max_range
    tbl_type = [None] * max_range

    def create_table(chatdata, max_range):
        for i in range(max_range):
            tbl_offset[i] = parser.get_offset(chatdata[i])
            tbl_id[i] = parser.get_id(chatdata[i])
            tbl_type[i] = parser.get_type(chatdata[i])

    def is_duplicate(i, j):
        return (
            tbl_offset[i] == tbl_offset[j]
            and tbl_id[i] == tbl_id[j]
            and tbl_type[i] == tbl_type[j]
        )

    print("creating table...")
    create_table(chatdata, max_range)
    print("searching duplicate data...")
    return [{"i": {
                "index": i, "id": parser.get_id(chatdata[i]),
                "offsetTime": parser.get_offset(chatdata[i]),
                "type": parser.get_type(chatdata[i])
            },
            "j": {
                "index": j, "id": parser.get_id(chatdata[j]),
                "offsetTime": parser.get_offset(chatdata[j]),
                "type": parser.get_type(chatdata[j])
            }
        }
        for i in range(max_range) for j in range(i + 1, max_range)
        if is_duplicate(i, j)]

def check_duplicate_offset(chatdata):
    max_range = len(chatdata)
    tbl_offset = [None] * max_range
    tbl_id = [None] * max_range
    tbl_type = [None] * max_range

    def create_table(chatdata, max_range):
        for i in range(max_range):
            tbl_offset[i] = parser.get_offset(chatdata[i])
            tbl_id[i] = parser.get_id(chatdata[i])
            tbl_type[i] = parser.get_type(chatdata[i])

    def is_duplicate(i, j):
        return (
            tbl_offset[i] == tbl_offset[j]
            and tbl_id[i] == tbl_id[j]
        )

    print("creating table...")
    create_table(chatdata, max_range)
    print("searching duplicate data...")

    return [{
            "index": i, "id": tbl_id[i],
            "offsetTime": tbl_offset[i],
            "type": tbl_type[i]
        }
        for i in range(max_range - 1)
        if is_duplicate(i, i + 1)]

def remove_duplicate_head(blocks):
    if len(blocks) == 1:
        return blocks

    def is_duplicate_head(index):
        if len(blocks[index].chat_data) == 0:
            return True
        elif len(blocks[index + 1].chat_data) == 0:
            return False
        id_0 = parser.get_id(blocks[index].chat_data[0])
        id_1 = parser.get_id(blocks[index + 1].chat_data[0])
        type_0 = parser.get_type(blocks[index].chat_data[0])
        type_1 = parser.get_type(blocks[index + 1].chat_data[0])
        return (
            blocks[index].first == blocks[index + 1].first
            and id_0 == id_1
            and type_0 == type_1
        )

    ret = [blocks[i] for i in range(len(blocks) - 1)
           if (len(blocks[i].chat_data) > 0 and
               not is_duplicate_head(i))]
    ret.append(blocks[-1])
    return ret

def remove_duplicate_tail(blocks):
    if len(blocks) == 1:
        return blocks

    def is_duplicate_tail(index):
        if len(blocks[index].chat_data) == 0:
            return True
        elif len(blocks[index - 1].chat_data) == 0:
            return False
        id_0 = parser.get_id(blocks[index - 1].chat_data[-1])
        id_1 = parser.get_id(blocks[index].chat_data[-1])
        type_0 = parser.get_type(blocks[index - 1].chat_data[-1])
        type_1 = parser.get_type(blocks[index].chat_data[-1])
        return (
            blocks[index - 1].last == blocks[index].last
            and id_0 == id_1
            and type_0 == type_1
        )

    ret = [blocks[i] for i in range(0, len(blocks))
           if i == 0 or not is_duplicate_tail(i)]
    return ret

def remove_overlap(blocks):
    """
    Fix overlapped blocks after ready_blocks().
    Align the last offset of each block to the first offset
    of the next block (equals the `end` offset of each block).
    """
    if len(blocks) == 1:
        return blocks

    for block in blocks:
        if block.is_last:
            break
        if len(block.chat_data) == 0:
            continue
        block_end = block.end
        if block.last >= block_end:
            for line in reversed(block.chat_data):
                if parser.get_offset(line) < block_end:
                    break
                block.chat_data.pop()
            block.last = parser.get_offset(line)
            block.remaining = 0
            block.done = True
            block.continuation = None
    return blocks

def _dump(blocks):
    print("---------- first last end---")
    for i, block in enumerate(blocks):
        print(f"block[{i:3}] {block.first:>10} {block.last:>10} {block.end:>10}")
54   pytchat/tool/download/parser.py  Normal file
@@ -0,0 +1,54 @@
import json
from ... import config
from ... exceptions import (
    ResponseContextError,
    NoContentsException,
    NoContinuationsException)

logger = config.logger(__name__)

def parse(jsn):
    """
    Parse replay chat data.

    Parameter:
    ----------
    jsn : dict
        JSON of replay chat data.

    Returns:
    -------
    continuation : str
    actions : list
    """
    if jsn is None:
        raise ValueError("parameter JSON is None")
    if jsn['response']['responseContext'].get('errors'):
        raise ResponseContextError(
            'video_id is invalid or private/deleted.')
    contents = jsn['response'].get('continuationContents')
    if contents is None:
        raise NoContentsException('No chat data.')

    cont = contents['liveChatContinuation']['continuations'][0]
    if cont is None:
        raise NoContinuationsException('No Continuation')
    metadata = cont.get('liveChatReplayContinuationData')
    if metadata:
        continuation = metadata.get("continuation")
        actions = contents['liveChatContinuation'].get('actions')
        return continuation, actions
    return None, []

def get_offset(item):
    return int(item['replayChatItemAction']["videoOffsetTimeMsec"])

def get_id(item):
    return list((list(item['replayChatItemAction']["actions"][0].values()
        )[0])['item'].values())[0].get('id')

def get_type(item):
    return list((list(item['replayChatItemAction']["actions"][0].values()
        )[0])['item'].keys())[0]
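Note: the minimal item shape these helpers read (field values made up):

    item = {"replayChatItemAction": {
        "videoOffsetTimeMsec": "9890",
        "actions": [{"addChatItemAction": {
            "item": {"liveChatTextMessageRenderer": {"id": "abc123"}}}}]}}
    assert get_offset(item) == 9890
    assert get_id(item) == "abc123"
    assert get_type(item) == "liveChatTextMessageRenderer"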
54   pytchat/tool/download/patch.py  Normal file
@@ -0,0 +1,54 @@
from . import parser
from . block import Block
from typing import NamedTuple

class Patch(NamedTuple):
    """
    Patch represents a chunk of chat data
    fetched by asyncdl.download_patch._fetch().
    """
    chats : list = []
    continuation : str = None
    first : int = None
    last : int = None

def fill(block: Block, patch: Patch):
    block_end = block.end
    if patch.last < block_end or block.is_last:
        set_patch(block, patch)
        return
    # cut the chats that overlap with the next block
    for line in reversed(patch.chats):
        line_offset = parser.get_offset(line)
        if line_offset < block_end:
            break
        patch.chats.pop()

    set_patch(block, patch._replace(
        continuation = None,
        last = line_offset
    ))
    block.remaining = 0
    block.done = True

def split(parent_block: Block, child_block: Block, patch: Patch):
    parent_block.during_split = False
    if patch.first <= parent_block.last:
        # patch overlaps with parent_block: discard it and leave
        # child_block.during_split == True to exclude the child
        # from the split sequence.
        child_block.continuation = None
        return
    child_block.during_split = False
    child_block.first = patch.first
    parent_block.end = patch.first
    fill(child_block, patch)

def set_patch(block: Block, patch: Patch):
    block.continuation = patch.continuation
    block.chat_data.extend(patch.chats)
    block.last = patch.last
    block.remaining = block.end - block.last
42   pytchat/tool/videoinfo.py  Normal file
@@ -0,0 +1,42 @@
import json
import re
import requests
from .. import config
from .. import util
from ..exceptions import InvalidVideoIdException

headers = config.headers
pattern = re.compile(r"yt\.setConfig\({'PLAYER_CONFIG': ({.*})}\);")

class VideoInfo:
    def __init__(self, video_id):
        self.video_id = video_id
        self.info = self._get_info(video_id)

    def _get_info(self, video_id):
        url = f"https://www.youtube.com/embed/{video_id}"
        resp = requests.get(url, headers = headers)
        resp.raise_for_status()
        return self._parse(resp.text)

    def _parse(self, html):
        result = re.search(pattern, html)
        res = json.loads(result.group(1))
        response = res["args"].get("embedded_player_response")
        if response is None:
            raise InvalidVideoIdException("Invalid video id.")
        renderer = (json.loads(response))["embedPreview"]["thumbnailPreviewRenderer"]
        return {
            "duration": int(renderer["videoDurationSeconds"]) if renderer.get("videoDurationSeconds") else 0,
            "title": [''.join(run["text"]) for run in renderer["title"]["runs"]][0] if renderer.get("title") else None,
            "channelId": renderer["videoDetails"]["embeddedPlayerOverlayVideoDetailsRenderer"]["channelThumbnailEndpoint"]["channelThumbnailEndpoint"]["urlEndpoint"]["urlEndpoint"]["url"][9:] if renderer.get("videoDetails") else None,
            "authorProfileImage": renderer["videoDetails"]["embeddedPlayerOverlayVideoDetailsRenderer"]["channelThumbnail"]["thumbnails"][0]["url"] if renderer.get("videoDetails") else None,
            "thumbnail": renderer["defaultThumbnail"]["thumbnails"][2]["url"] if renderer.get("defaultThumbnail") else None,
            "channelName": renderer["videoDetails"]["embeddedPlayerOverlayVideoDetailsRenderer"]["expandedRenderer"]["embeddedPlayerOverlayVideoDetailsExpandedRenderer"]["title"]["runs"][0]["text"] if renderer.get("videoDetails") else None,
            "movingThumbnail": renderer["movingThumbnail"]["thumbnails"][0]["url"] if renderer.get("movingThumbnail") else None
        }

    def get(self, item):
        return self.info.get(item)
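Note: minimal usage sketch (VIDEO_ID is a placeholder); download() above relies on duration being 0 for live streams:

    info = VideoInfo("VIDEO_ID")
    print(info.get("title"), info.get("duration"))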
77   tests/test_dl_asyncdl.py  Normal file
@@ -0,0 +1,77 @@
import aiohttp
import asyncio
import json
from pytchat.tool.download import parser
import sys
import time
from aioresponses import aioresponses
from concurrent.futures import CancelledError
from pytchat.tool.download import asyncdl

def _open_file(path):
    with open(path, mode='r', encoding='utf-8') as f:
        return f.read()

def test_asyncdl_split():
    ret = asyncdl._split(0, 1000, 1)
    assert ret == [0]

    ret = asyncdl._split(1000, 1000, 10)
    assert ret == [1000]

    ret = asyncdl._split(0, 1000, 5)
    assert ret == [0, 200, 400, 600, 800]

    ret = asyncdl._split(10.5, 700.3, 5)
    assert ret == [10, 148, 286, 424, 562]

    ret = asyncdl._split(0, 500, 5)
    assert ret == [0, 125, 250, 375]

    ret = asyncdl._split(0, 500, 500)
    assert ret == [0, 125, 250, 375]

    ret = asyncdl._split(-1, 1000, 5)
    assert ret == [-1, 199, 399, 599, 799]

    # invalid argument order
    try:
        ret = asyncdl._split(500, 0, 5)
        assert False
    except ValueError:
        assert True

    # invalid count
    try:
        ret = asyncdl._split(0, 500, -1)
        assert False
    except ValueError:
        assert True

    try:
        ret = asyncdl._split(0, 500, 0)
        assert False
    except ValueError:
        assert True

    # invalid argument type
    try:
        ret = asyncdl._split(0, 5000, 5.2)
        assert False
    except ValueError:
        assert True

    try:
        ret = asyncdl._split(0, 5000, "test")
        assert False
    except ValueError:
        assert True

    try:
        ret = asyncdl._split([0, 1], 5000, 5)
        assert False
    except ValueError:
        assert True
128  tests/test_dl_duplcheck.py  Normal file
@@ -0,0 +1,128 @@
import aiohttp
import asyncio
import json
import os, sys
import time
from pytchat.tool.download import duplcheck
from pytchat.tool.download import parser
from pytchat.tool.download.block import Block
from pytchat.tool.download.duplcheck import _dump

def _open_file(path):
    with open(path, mode='r', encoding='utf-8') as f:
        return f.read()

def test_overlap():
    """
    test overlap data
    operation : [0] [2] [3] [4] -> last : align to end
                [1] , [5]       -> no change
    """
    def load_chatdata(filename):
        return parser.parse(
            json.loads(_open_file("tests/testdata/dl_duplcheck/overlap/" + filename))
        )[1]

    blocks = (
        Block(first=0,     last=12771, end=9890,  chat_data=load_chatdata("dp0-0.json")),
        Block(first=9890,  last=15800, end=20244, chat_data=load_chatdata("dp0-1.json")),
        Block(first=20244, last=45146, end=32476, chat_data=load_chatdata("dp0-2.json")),
        Block(first=32476, last=50520, end=41380, chat_data=load_chatdata("dp0-3.json")),
        Block(first=41380, last=62875, end=52568, chat_data=load_chatdata("dp0-4.json")),
        Block(first=52568, last=62875, end=54000, chat_data=load_chatdata("dp0-5.json"), is_last=True)
    )
    result = duplcheck.remove_overlap(blocks)

    # dp0-0.json has an item whose offset time is 9890
    # (equals block[0].end = block[1].first), but `last` must be
    # aligned to the closest smaller value: 9779.
    assert result[0].last == 9779
    assert result[1].last == 15800
    assert result[2].last == 32196
    assert result[3].last == 41116
    assert result[4].last == 52384
    # the last block must always be added to the result.
    assert result[5].last == 62875

def test_duplicate_head():
    """
    test duplicate head data
    operation : [0] , [1] -> discard [0]
                [1] , [2] -> discard [1]
                [2] , [3] -> append [2]
                [3] , [4] -> discard [3]
                [4] , [5] -> append [4]
                append [5]

    result : [2] , [4] , [5]
    """
    def load_chatdata(filename):
        return parser.parse(
            json.loads(_open_file("tests/testdata/dl_duplcheck/head/" + filename))
        )[1]

    # chat data offsets are ignored.
    blocks = (
        Block(first=0,     last=2500,  chat_data=load_chatdata("dp0-0.json")),
        Block(first=0,     last=38771, chat_data=load_chatdata("dp0-1.json")),
        Block(first=0,     last=45146, chat_data=load_chatdata("dp0-2.json")),
        Block(first=20244, last=60520, chat_data=load_chatdata("dp0-3.json")),
        Block(first=20244, last=62875, chat_data=load_chatdata("dp0-4.json")),
        Block(first=52568, last=62875, chat_data=load_chatdata("dp0-5.json"))
    )
    _dump(blocks)
    result = duplcheck.remove_duplicate_head(blocks)

    assert len(result) == 3
    assert result[0].first == blocks[2].first
    assert result[0].last == blocks[2].last
    assert result[1].first == blocks[4].first
    assert result[1].last == blocks[4].last
    assert result[2].first == blocks[5].first
    assert result[2].last == blocks[5].last

def test_duplicate_tail():
    """
    test duplicate tail data
    operation : append [0]
                [0] , [1] -> discard [1]
                [1] , [2] -> append [2]
                [2] , [3] -> discard [3]
                [3] , [4] -> append [4]
                [4] , [5] -> discard [5]

    result : [0] , [2] , [4]
    """
    def load_chatdata(filename):
        return parser.parse(
            json.loads(_open_file("tests/testdata/dl_duplcheck/head/" + filename))
        )[1]

    # chat data offsets are ignored.
    blocks = (
        Block(first=0,     last=2500,  chat_data=load_chatdata("dp0-0.json")),
        Block(first=1500,  last=2500,  chat_data=load_chatdata("dp0-1.json")),
        Block(first=10000, last=45146, chat_data=load_chatdata("dp0-2.json")),
        Block(first=20244, last=45146, chat_data=load_chatdata("dp0-3.json")),
        Block(first=20244, last=62875, chat_data=load_chatdata("dp0-4.json")),
        Block(first=52568, last=62875, chat_data=load_chatdata("dp0-5.json"))
    )

    result = duplcheck.remove_duplicate_tail(blocks)
    _dump(result)
    assert len(result) == 3
    assert result[0].first == blocks[0].first
    assert result[0].last == blocks[0].last
    assert result[1].first == blocks[2].first
    assert result[1].last == blocks[2].last
    assert result[2].first == blocks[4].first
    assert result[2].last == blocks[4].last
232  tests/test_patch.py  Normal file
@@ -0,0 +1,232 @@
import aiohttp
import asyncio
import json
import os, sys
import time
from aioresponses import aioresponses
from pytchat.tool.download import duplcheck
from pytchat.tool.download import parser
from pytchat.tool.download.block import Block
from pytchat.tool.download.patch import Patch, fill, split, set_patch
from pytchat.tool.download.duplcheck import _dump

def _open_file(path):
    with open(path, mode='r', encoding='utf-8') as f:
        return f.read()

def load_chatdata(filename):
    return parser.parse(
        json.loads(_open_file("tests/testdata/dl_patch/" + filename))
    )[1]

def test_split_0():
    """
    Normal case.

    @parent_block (# = already downloaded)
    first    last                                   end
    |########----------------------------------------|

    @child_block
    first = last = 0                     end = parent_end
    ---------------------------------------------------|

    @fetched patch
                         |-- patch --|

                          |
                          V

    @parent_block
    first    last        end (after split)
    |########------------|

    @child_block
                         first      last            end
                         |###########---------------|

    @fetched patch
                         |-- patch --|
    """
    parent = Block(first=0, last=4000, end=60000, continuation='parent', during_split=True)
    child = Block(first=0, last=0, end=60000, continuation='mean', during_split=True)
    patch = Patch(chats=load_chatdata('pt0-5.json'),
                  first=32500, last=34000, continuation='patch')

    split(parent, child, patch)

    assert child.continuation == 'patch'
    assert parent.last < child.first
    assert parent.end == child.first
    assert child.first < child.last
    assert child.last < child.end
    assert parent.during_split == False
    assert child.during_split == False

def test_split_1():
    """
    patch.first <= parent_block.last

    While awaiting at run() -> asyncdl._fetch(),
    downloading of parent_block proceeds,
    and parent_block.last exceeds patch.first.

    In this case, the fetched patch is discarded entirely,
    and the dlworker searches for another block to process.

    ~~~~~~ before ~~~~~~

                    patch.first
    first               |  last                     end
    |####################|#####|---------------------|
                        ^
    @child_block
    first = last = 0                     end = parent_end
    ---------------------------------------------------|

    @fetched patch
                        |-- patch --|

                          |
                          V

    ~~~~~~ after ~~~~~~

    @parent_block
    first                      last                 end
    |###########################|--------------------|

    @child_block
    ..............  -> discard all data
    """
    parent = Block(first=0, last=33000, end=60000, continuation='parent', during_split=True)
    child = Block(first=0, last=0, end=60000, continuation='mean', during_split=True)
    patch = Patch(chats=load_chatdata('pt0-5.json'),
                  first=32500, last=34000, continuation='patch')

    split(parent, child, patch)

    assert parent.last == 33000   # no change
    assert parent.end == 60000    # no change
    assert child.continuation is None
    assert parent.during_split == False
    assert child.during_split == True   # excluded from the split sequence

def test_split_2():
    """
    child_block.end < patch.last:

    Case where the last offset of the patch exceeds child_block.end.
    In this case, remove the overlapping data from the patch.

    ~~~~~~ before ~~~~~~

    @parent_block (# = already downloaded)
    first    last                          end (before split)
    |########------------------------------|

    @child_block
    first = last = 0              end = parent_end
    -----------------------------------------|
        continuation: succeeds from patch

    @fetched patch
                     |-------- patch --------|

                          |
                          V

    ~~~~~~ after ~~~~~~

    @parent_block
    first    last        end (after split)
    |########------------|

    @child_block           old patch.end
                     first     last = end |
                     |#################|......  cut extra data
                                       ^
        continuation: None (download complete)

    @fetched patch
                     |-------- patch --------|
    """
    parent = Block(first=0, last=4000, end=33500, continuation='parent', during_split=True)
    child = Block(first=0, last=0, end=33500, continuation='mean', during_split=True)
    patch = Patch(chats=load_chatdata('pt0-5.json'),
                  first=32500, last=34000, continuation='patch')

    split(parent, child, patch)

    assert child.continuation is None
    assert parent.last < child.first
    assert parent.end == child.first
    assert child.first < child.last
    assert child.last < child.end
    assert parent.during_split == False
    assert child.during_split == False

def test_split_none():
    """
    patch.last <= parent_block.last

    While awaiting at run() -> asyncdl._fetch(),
    downloading of parent_block proceeds,
    and parent_block.last exceeds even patch.last.

    In this case, the fetched patch is discarded entirely,
    and the dlworker searches for another block to process.

    ~~~~~~ before ~~~~~~

                    patch.first
    first               |                 last      end
    |####################|###################|-------|
                        ^
    @child_block
    first = last = 0                     end = parent_end
    ---------------------------------------------------|

    @fetched patch
                        |-- patch --|
                 patch.last < parent_block.last

                          |
                          V

    ~~~~~~ after ~~~~~~

    @parent_block
    first                                 last      end
    |########################################|-------|

    @child_block
    ............  -> discard all data
    """
    parent = Block(first=0, last=40000, end=60000, continuation='parent', during_split=True)
    child = Block(first=0, last=0, end=60000, continuation='mean', during_split=True)
    patch = Patch(chats=load_chatdata('pt0-5.json'),
                  first=32500, last=34000, continuation='patch')

    split(parent, child, patch)

    assert parent.last == 40000   # no change
    assert parent.end == 60000    # no change
    assert child.continuation is None
    assert parent.during_split == False
    assert child.during_split == True   # excluded from the split sequence
The following vendored test data files are new; their diffs are suppressed because they are too large:

6128  tests/testdata/dl_duplcheck/head/dp0-0.json
3078  tests/testdata/dl_duplcheck/head/dp0-1.json
3078  tests/testdata/dl_duplcheck/head/dp0-2.json
3078  tests/testdata/dl_duplcheck/head/dp0-3.json
2529  tests/testdata/dl_duplcheck/head/dp0-4.json
1431  tests/testdata/dl_duplcheck/head/dp0-5.json
6128  tests/testdata/dl_duplcheck/overlap/dp0-0.json
3078  tests/testdata/dl_duplcheck/overlap/dp0-1.json
3078  tests/testdata/dl_duplcheck/overlap/dp0-2.json
3078  tests/testdata/dl_duplcheck/overlap/dp0-3.json
2529  tests/testdata/dl_duplcheck/overlap/dp0-4.json
1431  tests/testdata/dl_duplcheck/overlap/dp0-5.json
3078  tests/testdata/dl_patch/pt0-0.json
3078  tests/testdata/dl_patch/pt0-1.json
3078  tests/testdata/dl_patch/pt0-3.json
3078  tests/testdata/dl_patch/pt0-4.json
3078  tests/testdata/dl_patch/pt0-5.json