diff --git a/pytchat/tool/asyncdl.py b/pytchat/tool/asyncdl.py index d227118..e87b74e 100644 --- a/pytchat/tool/asyncdl.py +++ b/pytchat/tool/asyncdl.py @@ -13,29 +13,29 @@ headers = config.headers REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \ "get_live_chat_replay?continuation=" +def _divide(start, end, count): + min_interval = 120 + if (not isinstance(start,int) or + not isinstance(end,int) or + not isinstance(count,int)): + raise ValueError("start/end/count must be int") + if start>end: + raise ValueError("end must be equal to or greater than start.") + if count<1: + raise ValueError("count must be equal to or greater than 1.") + if (end-start)/count < min_interval: + count = int((end-start)/min_interval) + if count == 0 : count = 1 + interval= (end-start)/count + + if count == 1: + return [start] + return sorted(list(set([int(start+interval*j) + for j in range(count) ]))) + def ready_blocks(video_id, duration, div, callback): if div <= 0: raise ValueError - def _divide(start, end, count): - min_interval = 120 - if (not isinstance(start,int) or - not isinstance(end,int) or - not isinstance(count,int)): - raise ValueError("start/end/count must be int") - if start>end: - raise ValueError("end must be equal to or greater than start.") - if count<1: - raise ValueError("count must be equal to or greater than 1.") - if (end-start)/count < min_interval: - count = int((end-start)/min_interval) - if count == 0 : count = 1 - interval= (end-start)/count - - if count == 1: - return [start] - return sorted(list(set([int(start+interval*j) - for j in range(count) ]))) - async def _get_blocks( video_id, duration, div, callback): async with aiohttp.ClientSession() as session: futures = [_create_block(session, video_id, pos, seektime, callback) @@ -70,7 +70,7 @@ def ready_blocks(video_id, duration, div, callback): def download_chunk(callback, blocks): - async def _dl_distribute(): + async def _allocate_workers(): workers = [ DownloadWorker( fetch = _fetch, @@ -85,16 
async def _fetch(continuation, session):
    """Fetch one page of replay chat data.

    Parameter
    ---------
    continuation : str
        continuation param identifying the page to request.

    session :
        client session used for the GET request
        (presumably an aiohttp.ClientSession -- confirm against caller).

    Return
    ------
    tuple (actions, continuation, last)
        actions : chat actions parsed from the response
                  (empty list when the page holds no actions).
        continuation : continuation param returned by the parser for the
                  next page.
        last : offset of the last action as given by parser.get_offset(),
                  or None when there are no actions.

    The element order is the same on both return paths.  DownloadWorker.run
    unpacks the result as ``data, cont, fetched_last``; the former empty
    result ``(continuation, [], None)`` swapped the first two elements.
    """
    url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
    async with session.get(url, headers=config.headers) as resp:
        chat_json = await resp.text()
    # The body has been read completely, so parsing does not need the
    # response to stay open.
    continuation, actions = parser.parse(json.loads(chat_json))
    if not actions:
        return [], continuation, None

    last = parser.get_offset(actions[-1])
    first = parser.get_offset(actions[0])
    if callback:
        # Report the fetched actions together with the offset range
        # (last - first) they cover.
        callback(actions, last - first)
    return actions, continuation, last
async def run(self, session):
    """Download chat data for this worker's block until it is complete.

    First removes extra chats that the block already holds (just after
    ready_blocks()), then keeps fetching with the continuation returned by
    each request and appends the trimmed result to ``block.chat_data``.

    The loop ends when ``cut`` returns an empty continuation, or when a
    fetch reports no last offset (``None``).

    Parameter
    ---------
    session :
        session object handed through to ``self.fetch`` unchanged.
    """
    block = self.block
    temp_last = block.temp_last

    # Remove extra chats just after ready_blocks().
    block.chat_data, continuation = self.cut(
        block.chat_data,
        block.continuation,
        block.last,
        temp_last,
    )

    # download loop
    while continuation:
        data, cont, fetched_last = await self.fetch(continuation, session)
        if fetched_last is None:
            # The fetch returned no chat items, so there is no offset to
            # compare: cut() evaluates `fetched_last < temp_last`, which
            # raises TypeError for None.  Nothing can be appended either,
            # so the download of this block stops here.
            break
        data, continuation = self.cut(data, cont, fetched_last, temp_last)
        block.chat_data.extend(data)
import parser def check_duplicate(chatdata): max_range = len(chatdata)-1 tbl_offset = [None] * max_range - tbl_id =[None] * max_range - tbl_type=[None] * max_range + tbl_id = [None] * max_range + tbl_type = [None] * max_range - def create_table(chatdata,max_range): + def create_table(chatdata, max_range): for i in range(max_range): tbl_offset[i] = parser.get_offset(chatdata[i]) tbl_id[i] = parser.get_id(chatdata[i]) tbl_type[i] = parser.get_type(chatdata[i]) - def is_duplicate(i,j): + def is_duplicate(i, j): return ( tbl_offset[i] == tbl_offset[j] and