diff --git a/pytchat/__init__.py b/pytchat/__init__.py index 74eea41..161568c 100644 --- a/pytchat/__init__.py +++ b/pytchat/__init__.py @@ -2,7 +2,7 @@ pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup. """ __copyright__ = 'Copyright (C) 2019 taizan-hokuto' -__version__ = '0.1.9' +__version__ = '0.2.0' __license__ = 'MIT' __author__ = 'taizan-hokuto' __author_email__ = '55448286+taizan-hokuto@users.noreply.github.com' diff --git a/pytchat/cli/__init__.py b/pytchat/cli/__init__.py index c11d1b5..a448e74 100644 --- a/pytchat/cli/__init__.py +++ b/pytchat/cli/__init__.py @@ -1,11 +1,11 @@ import argparse import os +import sys import signal from json.decoder import JSONDecodeError from pathlib import Path from .arguments import Arguments -from .progressbar import ProgressBar from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError from .. processors.html_archiver import HTMLArchiver from .. tool.extract.extractor import Extractor @@ -32,11 +32,18 @@ def main(): 'If ID starts with a hyphen (-), enclose the ID in square brackets.') parser.add_argument('-o', f'--{Arguments.Name.OUTPUT}', type=str, help='Output directory (end with "/"). default="./"', default='./') - parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true', - help='Show version') + parser.add_argument(f'--{Arguments.Name.PBAR}', action='store_true', + help='Display rich progress bar') parser.add_argument(f'--{Arguments.Name.SAVE_ERROR_DATA}', action='store_true', help='Save error data when error occurs(".dat" file)') + parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true', + help='Show version') Arguments(parser.parse_args().__dict__) + + if Arguments().pbar: + from .progressbar_rich import ProgressBar + else: + from .progressbar_simple import ProgressBar if Arguments().print_version: print(f'pytchat v{__version__} © 2019 taizan-hokuto') return @@ -62,15 +69,18 @@ def main(): print(f" output path: {path.resolve()}") duration = info.get_duration() - pbar = ProgressBar(total=(duration * 1000) / 0.99, status="Extracting") - ex = Extractor(video_id, + pbar = ProgressBar(total=(duration * 1000), status="Extracting") + ex = Extractor(video_id, callback=pbar._disp, div=10) signal.signal(signal.SIGINT, (lambda a, b: cancel(ex, pbar))) data = ex.extract() if data == []: return False - pbar.reset("#", "=", total=len(data), status="Rendering ") + if Arguments().pbar: + pbar.reset("#", "=", total=len(data), status="Rendering ") + else: + pbar.reset("=", "", total=len(data), status="Rendering ") processor = HTMLArchiver(Arguments().output + video_id + '.html', callback=pbar._disp) processor.process( [{'video_id': None, @@ -78,8 +88,13 @@ def main(): 'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}] ) processor.finalize() - pbar.reset('#', '#', status='Completed ') - pbar.close() + if Arguments().pbar: + pbar.reset('#', '#', status='Completed ') + pbar.close() + else: + pbar.close() + print("\nCompleted") + print() if pbar.is_cancelled(): print("\nThe extraction process has been discontinued.\n") @@ -106,6 +121,6 @@ def main(): return -def cancel(ex: Extractor, pbar: ProgressBar): +def cancel(ex, pbar): ex.cancel() pbar.cancel() diff --git a/pytchat/cli/arguments.py b/pytchat/cli/arguments.py index 0e8e9e8..e22b293 100644 --- a/pytchat/cli/arguments.py +++ b/pytchat/cli/arguments.py @@ -19,6 +19,7 @@ class Arguments(metaclass=Singleton): OUTPUT: str = 'output_dir' VIDEO_IDS: str = 'video_id' SAVE_ERROR_DATA: bool = 'save_error_data' + PBAR: bool ='pbar' def __init__(self, arguments: Optional[Dict[str, Union[str, bool, int]]] = None): @@ -36,6 +37,7 @@ class Arguments(metaclass=Singleton): self.output: str = arguments[Arguments.Name.OUTPUT] self.video_ids: List[int] = [] self.save_error_data: bool = arguments[Arguments.Name.SAVE_ERROR_DATA] + self.pbar: bool = arguments[Arguments.Name.PBAR] # Videos if arguments[Arguments.Name.VIDEO_IDS]: self.video_ids = [video_id diff --git a/pytchat/cli/progressbar.py b/pytchat/cli/progressbar_rich.py similarity index 90% rename from pytchat/cli/progressbar.py rename to pytchat/cli/progressbar_rich.py index b61d6bc..65ca9b0 100644 --- a/pytchat/cli/progressbar.py +++ b/pytchat/cli/progressbar_rich.py @@ -6,8 +6,6 @@ https://gist.github.com/vladignatyev/06860ec2040cb497f0f3 ''' import sys -ROT = ['\u25F4', '\u25F5', '\u25F6', '\u25F7'] - class ProgressBar: def __init__(self, total, status): @@ -39,7 +37,7 @@ class ProgressBar: bar = self._symbol_done * filled_len + \ self._symbol_space * (self._bar_len - filled_len) - sys.stdout.write(' [%s] %s%s ...%s %s \r' % (bar, percents, '%', self._status, ROT[self._blinker % 4])) + sys.stdout.write(' [%s] %s%s ...%s \r' % (bar, percents, '%', self._status)) sys.stdout.flush() self._blinker += 1 diff --git a/pytchat/cli/progressbar_simple.py b/pytchat/cli/progressbar_simple.py new file mode 100644 index 0000000..776fad2 --- /dev/null +++ b/pytchat/cli/progressbar_simple.py @@ -0,0 +1,49 @@ +''' +This code for this progress bar is based on +vladignatyev/progress.py +https://gist.github.com/vladignatyev/06860ec2040cb497f0f3 +(MIT License) +''' +import sys + + +class ProgressBar: + def __init__(self, total, status): + self._bar_len = 60 + self._cancelled = False + print(''.join([' ' * 10, '|', '-' * (self._bar_len), '|']), end="") + self.reset(total=total, status=status) + + def reset(self, symbol_done="=", symbol_space=" ", total=100, status=''): + self._symbol_done = symbol_done + self._symbol_space = symbol_space + self._total = total + self._status = status + self._old_len = 0 + self._count = 0 + print() + print(f'{status:<11}', end='') + + def _disp(self, _, fetched): + self._progress(fetched, self._total) + + def _progress(self, fillin, total): + if total == 0 or self._cancelled: + return + self._count += fillin + filled_len = int(round(self._bar_len * self._count / float(total))) + if filled_len > self._bar_len: + filled_len = self._bar_len + print((filled_len - self._old_len) * self._symbol_done, end="") + sys.stdout.flush() + self._old_len = filled_len + + def close(self): + if not self._cancelled: + self._progress(self._total, self._total) + + def cancel(self): + self._cancelled = True + + def is_cancelled(self): + return self._cancelled diff --git a/pytchat/tool/extract/parser.py b/pytchat/tool/extract/parser.py index a2568a4..2866af2 100644 --- a/pytchat/tool/extract/parser.py +++ b/pytchat/tool/extract/parser.py @@ -42,10 +42,14 @@ def get_offset(item): def get_id(item): - return list((list(item['replayChatItemAction']["actions"][0].values() - )[0])['item'].values())[0].get('id') + a = list(item['replayChatItemAction']["actions"][0].values())[0].get('item') + if a: + return list(a.values())[0].get('id') + return None def get_type(item): - return list((list(item['replayChatItemAction']["actions"][0].values() - )[0])['item'].keys())[0] + a = list(item['replayChatItemAction']["actions"][0].values())[0].get('item') + if a: + return list(a.keys())[0] + return None diff --git a/pytchat/tool/mining/__init__.py b/pytchat/tool/mining/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pytchat/tool/mining/asyncdl.py b/pytchat/tool/mining/asyncdl.py deleted file mode 100644 index 8bb1bc3..0000000 --- a/pytchat/tool/mining/asyncdl.py +++ /dev/null @@ -1,146 +0,0 @@ - -import httpx -import asyncio -import json -from . import parser -from . block import Block -from . worker import ExtractWorker -from . patch import Patch -from ... import config -from ... paramgen import arcparam_mining as arcparam -from concurrent.futures import CancelledError -from urllib.parse import quote - -headers = config.headers -REPLAY_URL = "https://www.youtube.com/live_chat_replay?continuation=" -INTERVAL = 1 - - -def _split(start, end, count, min_interval_sec=120): - """ - Split section from `start` to `end` into `count` pieces, - and returns the beginning of each piece. - The `count` is adjusted so that the length of each piece - is no smaller than `min_interval`. - - Returns: - -------- - List of the offset of each block's first chat data. - """ - - if not (isinstance(start, int) or isinstance(start, float)) or \ - not (isinstance(end, int) or isinstance(end, float)): - raise ValueError("start/end must be int or float") - if not isinstance(count, int): - raise ValueError("count must be int") - if start > end: - raise ValueError("end must be equal to or greater than start.") - if count < 1: - raise ValueError("count must be equal to or greater than 1.") - if (end - start) / count < min_interval_sec: - count = int((end - start) / min_interval_sec) - if count == 0: - count = 1 - interval = (end - start) / count - - if count == 1: - return [start] - return sorted(list(set([int(start + interval * j) - for j in range(count)]))) - - -def ready_blocks(video_id, duration, div, callback): - if div <= 0: - raise ValueError - - async def _get_blocks(video_id, duration, div, callback): - async with httpx.ClientSession() as session: - tasks = [_create_block(session, video_id, seektime, callback) - for seektime in _split(0, duration, div)] - return await asyncio.gather(*tasks) - - async def _create_block(session, video_id, seektime, callback): - continuation = arcparam.getparam(video_id, seektime=seektime) - url = (f"{REPLAY_URL}{quote(continuation)}&playerOffsetMs=" - f"{int(seektime*1000)}&hidden=false&pbj=1") - async with session.get(url, headers=headers) as resp: - chat_json = await resp.text() - if chat_json is None: - return - continuation, actions = parser.parse(json.loads(chat_json)[1]) - first = seektime - seektime += INTERVAL - if callback: - callback(actions, INTERVAL) - return Block( - continuation=continuation, - chat_data=actions, - first=first, - last=seektime, - seektime=seektime - ) - """ - fetch initial blocks. - """ - loop = asyncio.get_event_loop() - blocks = loop.run_until_complete( - _get_blocks(video_id, duration, div, callback)) - return blocks - - -def fetch_patch(callback, blocks, video_id): - - async def _allocate_workers(): - workers = [ - ExtractWorker( - fetch=_fetch, block=block, - blocks=blocks, video_id=video_id - ) - for block in blocks - ] - async with httpx.ClientSession() as session: - tasks = [worker.run(session) for worker in workers] - return await asyncio.gather(*tasks) - - async def _fetch(seektime, session) -> Patch: - continuation = arcparam.getparam(video_id, seektime=seektime) - url = (f"{REPLAY_URL}{quote(continuation)}&playerOffsetMs=" - f"{int(seektime*1000)}&hidden=false&pbj=1") - async with session.get(url, headers=config.headers) as resp: - chat_json = await resp.text() - actions = [] - try: - if chat_json is None: - return Patch() - continuation, actions = parser.parse(json.loads(chat_json)[1]) - except json.JSONDecodeError: - pass - if callback: - callback(actions, INTERVAL) - return Patch(chats=actions, continuation=continuation, - seektime=seektime, last=seektime) - """ - allocate workers and assign blocks. - """ - loop = asyncio.get_event_loop() - try: - loop.run_until_complete(_allocate_workers()) - except CancelledError: - pass - - -async def _shutdown(): - print("\nshutdown...") - tasks = [t for t in asyncio.all_tasks() - if t is not asyncio.current_task()] - for task in tasks: - task.cancel() - try: - await task - except asyncio.CancelledError: - pass - - -def cancel(): - loop = asyncio.get_event_loop() - loop.create_task(_shutdown()) diff --git a/pytchat/tool/mining/block.py b/pytchat/tool/mining/block.py deleted file mode 100644 index 40c95d1..0000000 --- a/pytchat/tool/mining/block.py +++ /dev/null @@ -1,62 +0,0 @@ -from . import parser -class Block: - """Block object represents something like a box - to join chunk of chatdata. - - Parameter: - --------- - first : int : - videoOffsetTimeMs of the first chat_data - (chat_data[0]) - - last : int : - videoOffsetTimeMs of the last chat_data. - (chat_data[-1]) - - this value increases as fetching chatdata progresses. - - end : int : - target videoOffsetTimeMs of last chat data for extract, - equals to first videoOffsetTimeMs of next block. - when extract worker reaches this offset, stop fetching. - - continuation : str : - continuation param of last chat data. - - chat_data : list - - done : bool : - whether this block has been fetched. - - remaining : int : - remaining data to extract. - equals end - last. - - is_last : bool : - whether this block is the last one in blocklist. - - during_split : bool : - whether this block is in the process of during_split. - while True, this block is excluded from duplicate split procedure. - - seektime : float : - the last position of this block(seconds) already fetched. - """ - - __slots__ = ['first','last','end','continuation','chat_data','remaining', - 'done','is_last','during_split','seektime'] - - def __init__(self, first = 0, last = 0, end = 0, - continuation = '', chat_data = [], is_last = False, - during_split = False, seektime = None): - self.first = first - self.last = last - self.end = end - self.continuation = continuation - self.chat_data = chat_data - self.done = False - self.remaining = self.end - self.last - self.is_last = is_last - self.during_split = during_split - self.seektime = seektime - diff --git a/pytchat/tool/mining/parser.py b/pytchat/tool/mining/parser.py deleted file mode 100644 index f9a692f..0000000 --- a/pytchat/tool/mining/parser.py +++ /dev/null @@ -1,73 +0,0 @@ -import re -from ... import config -from ... exceptions import ( - ResponseContextError, - NoContents, NoContinuation) - -logger = config.logger(__name__) - - -def parse(jsn): - """ - Parse replay chat data. - Parameter: - ---------- - jsn : dict - JSON of replay chat data. - Returns: - ------ - continuation : str - actions : list - - """ - if jsn is None: - raise ValueError("parameter JSON is None") - if jsn['response']['responseContext'].get('errors'): - raise ResponseContextError( - 'video_id is invalid or private/deleted.') - contents = jsn["response"].get('continuationContents') - if contents is None: - raise NoContents('No chat data.') - - cont = contents['liveChatContinuation']['continuations'][0] - if cont is None: - raise NoContinuation('No Continuation') - metadata = cont.get('liveChatReplayContinuationData') - if metadata: - continuation = metadata.get("continuation") - actions = contents['liveChatContinuation'].get('actions') - if continuation: - return continuation, [action["replayChatItemAction"]["actions"][0] - for action in actions - if list(action['replayChatItemAction']["actions"][0].values() - )[0]['item'].get("liveChatPaidMessageRenderer") - or list(action['replayChatItemAction']["actions"][0].values() - )[0]['item'].get("liveChatPaidStickerRenderer") - ] - return None, [] - - -def get_offset(item): - return int(item['replayChatItemAction']["videoOffsetTimeMsec"]) - - -def get_id(item): - return list((list(item['replayChatItemAction']["actions"][0].values() - )[0])['item'].values())[0].get('id') - - -def get_type(item): - return list((list(item['replayChatItemAction']["actions"][0].values() - )[0])['item'].keys())[0] - - -_REGEX_YTINIT = re.compile( - "window\\[\"ytInitialData\"\\]\\s*=\\s*({.+?});\\s+") - - -def extract(text): - - match = re.findall(_REGEX_YTINIT, str(text)) - if match: - return match[0] - return None diff --git a/pytchat/tool/mining/patch.py b/pytchat/tool/mining/patch.py deleted file mode 100644 index 7666a52..0000000 --- a/pytchat/tool/mining/patch.py +++ /dev/null @@ -1,27 +0,0 @@ -from . import parser -from . block import Block -from typing import NamedTuple - -class Patch(NamedTuple): - """ - Patch represents chunk of chat data - which is fetched by asyncdl.fetch_patch._fetch(). - """ - chats : list = [] - continuation : str = None - seektime : float = None - first : int = None - last : int = None - -def fill(block:Block, patch:Patch): - if patch.last < block.end: - set_patch(block, patch) - return - block.continuation = None - -def set_patch(block:Block, patch:Patch): - block.continuation = patch.continuation - block.chat_data.extend(patch.chats) - block.last = patch.seektime - block.seektime = patch.seektime - diff --git a/pytchat/tool/mining/superchat_miner.py b/pytchat/tool/mining/superchat_miner.py deleted file mode 100644 index 8a5b3bd..0000000 --- a/pytchat/tool/mining/superchat_miner.py +++ /dev/null @@ -1,72 +0,0 @@ -from . import asyncdl -from . import parser -from .. videoinfo import VideoInfo -from ... import config -from ... exceptions import InvalidVideoIdException -logger = config.logger(__name__) -headers=config.headers - -class SuperChatMiner: - def __init__(self, video_id, duration, div, callback): - if not isinstance(div ,int) or div < 1: - raise ValueError('div must be positive integer.') - elif div > 10: - div = 10 - if not isinstance(duration ,int) or duration < 1: - raise ValueError('duration must be positive integer.') - self.video_id = video_id - self.duration = duration - self.div = div - self.callback = callback - self.blocks = [] - - def _ready_blocks(self): - blocks = asyncdl.ready_blocks( - self.video_id, self.duration, self.div, self.callback) - self.blocks = [block for block in blocks if block is not None] - return self - - def _set_block_end(self): - for i in range(len(self.blocks)-1): - self.blocks[i].end = self.blocks[i+1].first - self.blocks[-1].end = self.duration - self.blocks[-1].is_last =True - return self - - def _download_blocks(self): - asyncdl.fetch_patch(self.callback, self.blocks, self.video_id) - return self - - def _combine(self): - ret = [] - for block in self.blocks: - ret.extend(block.chat_data) - return ret - - def extract(self): - return ( - self._ready_blocks() - ._set_block_end() - ._download_blocks() - ._combine() - ) - -def extract(video_id, div = 1, callback = None, processor = None): - duration = 0 - try: - duration = VideoInfo(video_id).get_duration() - except InvalidVideoIdException: - raise - if duration == 0: - print("video is live.") - return [] - data = SuperChatMiner(video_id, duration, div, callback).extract() - if processor is None: - return data - return processor.process( - [{'video_id':None,'timeout':1,'chatdata' : (action - for action in data)}] - ) - -def cancel(): - asyncdl.cancel() \ No newline at end of file diff --git a/pytchat/tool/mining/worker.py b/pytchat/tool/mining/worker.py deleted file mode 100644 index 3a53e40..0000000 --- a/pytchat/tool/mining/worker.py +++ /dev/null @@ -1,45 +0,0 @@ -from . import parser -from . block import Block -from . patch import Patch, fill -from ... paramgen import arcparam -INTERVAL = 1 -class ExtractWorker: - """ - ExtractWorker associates a download session with a block. - - When the worker finishes fetching, the block - being fetched is splitted and assigned the free worker. - - Parameter - ---------- - fetch : func : - extract function of asyncdl - - block : Block : - Block object that includes chat_data - - blocks : list : - List of Block(s) - - video_id : str : - - parent_block : Block : - the block from which current block is splitted - """ - __slots__ = ['block', 'fetch', 'blocks', 'video_id', 'parent_block'] - def __init__(self, fetch, block, blocks, video_id ): - self.block:Block = block - self.fetch = fetch - self.blocks:list = blocks - self.video_id:str = video_id - self.parent_block:Block = None - - async def run(self, session): - while self.block.continuation: - patch = await self.fetch( - self.block.seektime, session) - fill(self.block, patch) - self.block.seektime += INTERVAL - self.block.done = True - - diff --git a/tests/test_arcparam_mining.py b/tests/test_arcparam_mining.py deleted file mode 100644 index 04e1140..0000000 --- a/tests/test_arcparam_mining.py +++ /dev/null @@ -1,41 +0,0 @@ -from pytchat.tool.mining import parser -import pytchat.config as config -import httpx -import json -from pytchat.paramgen import arcparam_mining as arcparam - - -def test_arcparam_e(mocker): - try: - arcparam.getparam("01234567890", -1) - assert False - except ValueError: - assert True - - -def test_arcparam_0(mocker): - param = arcparam.getparam("01234567890", 0) - - assert param == "op2w0wQsGiBDZzhhRFFvTE1ERXlNelExTmpjNE9UQWdBUSUzRCUzREABYARyAggBeAE%3D" - - -def test_arcparam_1(mocker): - param = arcparam.getparam("01234567890", seektime=100000) - print(param) - assert param == "op2w0wQzGiBDZzhhRFFvTE1ERXlNelExTmpjNE9UQWdBUSUzRCUzREABWgUQgMLXL2AEcgIIAXgB" - - -def test_arcparam_2(mocker): - param = arcparam.getparam("PZz9NB0-Z64", 1) - url = f"https://www.youtube.com/live_chat_replay?continuation={param}&playerOffsetMs=1000&pbj=1" - resp = httpx.Client(http2=True).get(url, headers=config.headers) - jsn = json.loads(resp.text) - _, chatdata = parser.parse(jsn[1]) - test_id = chatdata[0]["addChatItemAction"]["item"]["liveChatPaidMessageRenderer"]["id"] - print(test_id) - assert test_id == "ChwKGkNKSGE0YnFJeWVBQ0ZWcUF3Z0VkdGIwRm9R" - - -def test_arcparam_3(mocker): - param = arcparam.getparam("01234567890") - assert param == "op2w0wQsGiBDZzhhRFFvTE1ERXlNelExTmpjNE9UQWdBUSUzRCUzREABYARyAggBeAE%3D"