Merge branch 'hotfix/fix_json'

Increment version
Fix import module
2020-10-06 01:30:15 +09:00 · 2020-10-06 01:24:31 +09:00 · 2020-10-06 01:24:04 +09:00 · 2020-10-06 01:20:25 +09:00 · 2020-10-06 01:19:45 +09:00 · 2020-10-05 21:38:51 +09:00
10 changed files with 219 additions and 101 deletions
--- a/pytchat/init.py
+++ b/pytchat/init.py
@@ -2,7 +2,7 @@
 pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
 """
 __copyright__    = 'Copyright (C) 2019 taizan-hokuto'
-__version__      = '0.2.2'
+__version__      = '0.3.2'
 __license__      = 'MIT'
 __author__       = 'taizan-hokuto'
 __author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'
--- a/pytchat/cli/init.py
+++ b/pytchat/cli/init.py
@@ -1,13 +1,17 @@
 import argparse
-
+import asyncio
 try:
    from asyncio import CancelledError
 except ImportError:
    from asyncio.futures import CancelledError
 import os
 import signal
 import time
 from json.decoder import JSONDecodeError
 from pathlib import Path
 from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError
 from .arguments import Arguments
 from .progressbar import ProgressBar
-from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError
+from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError, UnknownConnectionError
 from .. processors.html_archiver import HTMLArchiver
 from .. tool.extract.extractor import Extractor
 from .. tool.videoinfo import VideoInfo
@@ -37,6 +41,7 @@ def main():
                        help='Save error data when error occurs(".dat" file)')
    parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true',
                        help='Show version')
    Arguments(parser.parse_args().__dict__)
    if Arguments().print_version:
@@ -47,75 +52,106 @@ def main():
    if not Arguments().video_ids:
        parser.print_help()
        return
    for counter, video_id in enumerate(Arguments().video_ids):
        if '[' in video_id:
            video_id = video_id.replace('[', '').replace(']', '')
        try:
            video_id = extract_video_id(video_id)
            if os.path.exists(Arguments().output):
                path = Path(Arguments().output + video_id + '.html')
            else:
                raise FileNotFoundError
            err = None
            for _ in range(3): # retry 3 times
                try:                
                    info = VideoInfo(video_id)
                    break
                except (PatternUnmatchError, JSONDecodeError, InvalidVideoIdException) as e:
                    err = e
                    time.sleep(2)
                    continue
            else:
                print("Cannot parse video information.:{}".format(video_id))
                if Arguments().save_error_data:
                    util.save(err.doc, "ERR", ".dat")
                continue
    if not os.path.exists(Arguments().output):
        print("\nThe specified directory does not exist.:{}\n".format(Arguments().output))
        return
    try:
        Runner().run()
    except CancelledError as e:
        print(str(e))
 class Runner:
    def run(self) -> None:
        ex = None
        pbar = None
        for counter, video_id in enumerate(Arguments().video_ids):
            if len(Arguments().video_ids) > 1:
                print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
            print(f"\n"
                  f" video_id: {video_id}\n"
                  f" channel:  {info.get_channel_name()}\n"
                  f" title:    {info.get_title()}")
-            print(f" output path: {path.resolve()}")
+            try:
-            duration = info.get_duration()
+                video_id = extract_video_id(video_id)
-            pbar = ProgressBar(total=(duration * 1000), status="Extracting")
+                separated_path = str(Path(Arguments().output)) + os.path.sep
-            ex = Extractor(video_id,               
+                path = util.checkpath(separated_path + video_id + '.html')
-                    callback=pbar._disp,
+                try:
-                    div=10)
+                    info = VideoInfo(video_id)
-            signal.signal(signal.SIGINT, (lambda a, b: cancel(ex, pbar)))
+                except Exception as e:
-            data = ex.extract()
+                    print("Cannot parse video information.:{} {}".format(video_id, type(e)))
-            if data == []:
+                    if Arguments().save_error_data:
-                return False
+                        util.save(str(e), "ERR", ".dat")
-            pbar.reset("#", "=", total=len(data), status="Rendering  ")
+                    continue
            processor = HTMLArchiver(Arguments().output + video_id + '.html', callback=pbar._disp)
            processor.process(
                [{'video_id': None,
                'timeout': 1,
                'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
            )
            processor.finalize()
            pbar.reset('#', '#', status='Completed   ')
            pbar.close()
            print()
            if pbar.is_cancelled():
                print("\nThe extraction process has been discontinued.\n")
        except InvalidVideoIdException:
            print("Invalid Video ID or URL:", video_id)
        except NoContents as e:
            print(e)
        except FileNotFoundError:
            print("The specified directory does not exist.:{}".format(Arguments().output))
        except JSONDecodeError as e:
            print(e.msg)
            print("JSONDecodeError.:{}".format(video_id))
            if Arguments().save_error_data:
                util.save(e.doc, "ERR_JSON_DECODE", ".dat")
-    return
+                print(f"\n"
                    f" video_id: {video_id}\n"
                    f" channel:  {info.get_channel_name()}\n"
                    f" title:    {info.get_title()}\n"
                    f" output path: {path}")
                duration = info.get_duration()
                pbar = ProgressBar(total=(duration * 1000), status_txt="Extracting")
                ex = Extractor(video_id,
                        callback=pbar.disp,
                        div=10)
                signal.signal(signal.SIGINT, (lambda a, b: self.cancel(ex, pbar)))
                data = ex.extract()
                if data == []:
                    continue
                pbar.reset("#", "=", total=len(data), status_txt="Rendering  ")
                processor = HTMLArchiver(path, callback=pbar.disp)
                processor.process(
                    [{'video_id': None,
                    'timeout': 1,
                    'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
                )
                processor.finalize()
                pbar.reset('#', '#', status_txt='Completed   ')
                pbar.close()
                print()
                if pbar.is_cancelled():
                    print("\nThe extraction process has been discontinued.\n")
            except InvalidVideoIdException:
                print("Invalid Video ID or URL:", video_id)
            except NoContents as e:
                print(f"Abort:{str(e)}:[{video_id}]")
            except (JSONDecodeError, PatternUnmatchError) as e:
                print("{}:{}".format(e.msg, video_id))
                if Arguments().save_error_data:
                    util.save(e.doc, "ERR_", ".dat")
            except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
                print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
            except Exception as e:
                print(f"Abort:{str(type(e))} {str(e)[:80]}")
            finally:
                clear_tasks()
        return
    def cancel(self, ex=None, pbar=None) -> None:
        '''Handle a keyboard interrupt by stopping the running extraction.

        Prints a notice, then calls ``cancel()`` on each object that was
        supplied, extractor first and progress bar second.

        Parameters
        ----------
        ex :
            Object exposing ``cancel()`` (``run`` passes its Extractor).
            Skipped when None.
        pbar :
            Object exposing ``cancel()`` (``run`` passes its ProgressBar).
            Skipped when None.
        '''
        print("\nKeyboard interrupted.\n")
        # Each argument is optional on its own, so cancel whichever one is
        # present instead of requiring both; compare against None rather
        # than relying on the objects' truthiness.
        for target in (ex, pbar):
            if target is not None:
                target.cancel()
-def cancel(ex, pbar):
+def clear_tasks():
-    ex.cancel()
+    '''
-    pbar.cancel()
+    Clear remained tasks.
    Called when internal exception has occurred or
    after each extraction process is completed.
    '''
    async def _shutdown():
        tasks = [t for t in asyncio.all_tasks()
                if t is not asyncio.current_task()]
        for task in tasks:
            task.cancel()
    try:
        loop = asyncio.get_event_loop()
        loop.run_until_complete(_shutdown())
    except Exception as e:
        print(e)
--- a/pytchat/cli/progressbar.py
+++ b/pytchat/cli/progressbar.py
@@ -9,21 +9,20 @@ import sys
 class ProgressBar:
-    def __init__(self, total, status):
+    def __init__(self, total, status_txt):
        self._bar_len = 60
        self._cancelled = False
-        self.reset(total=total, status=status)
+        self.reset(total=total, status_txt=status_txt)
        self._blinker = 0
-    def reset(self, symbol_done="=", symbol_space=" ", total=100, status=''):
+    def reset(self, symbol_done="=", symbol_space=" ", total=100, status_txt=''):
-        self.con_width = shutil.get_terminal_size(fallback=(80, 24)).columns
+        self._console_width = shutil.get_terminal_size(fallback=(80, 24)).columns
        self._symbol_done = symbol_done
        self._symbol_space = symbol_space
        self._total = total
-        self._status = status
+        self._status_txt = status_txt
        self._count = 0
-    def _disp(self, _, fetched):
+    def disp(self, _, fetched):
        self._progress(fetched, self._total)
    def _progress(self, fillin, total):
@@ -39,11 +38,10 @@ class ProgressBar:
        bar = self._symbol_done * filled_len + \
              self._symbol_space * (self._bar_len - filled_len)
-        disp = f" [{bar}] {percents:>5.1f}% ...{self._status} "[:self.con_width - 1] + '\r'
+        disp = f" [{bar}] {percents:>5.1f}% ...{self._status_txt} "[:self._console_width - 1] + '\r'
        sys.stdout.write(disp)
        sys.stdout.flush()
        self._blinker += 1
    def close(self):
        if not self._cancelled:
--- a/pytchat/exceptions.py
+++ b/pytchat/exceptions.py
@@ -43,7 +43,6 @@ class InvalidVideoIdException(Exception):
        self.doc = doc
 class UnknownConnectionError(Exception):
    pass
--- a/pytchat/processors/html_archiver.py
+++ b/pytchat/processors/html_archiver.py
@@ -1,9 +1,12 @@
 import httpx
 import os
 import re
-import httpx
+import time
 from base64 import standard_b64encode
 from httpx import NetworkError, ReadTimeout
 from .chat_processor import ChatProcessor
 from .default.processor import DefaultProcessor
 from ..exceptions import UnknownConnectionError
 PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
@@ -112,7 +115,18 @@ class HTMLArchiver(ChatProcessor):
                       for item in message_items)
    def _encode_img(self, url):
-        resp = httpx.get(url)
+        err = None
        for _ in range(5):
            try:
                resp = httpx.get(url, timeout=30)
                break
            except (NetworkError, ReadTimeout) as e:
                print("Network Error. retrying...")
                err = e
                time.sleep(3)
        else:
            raise UnknownConnectionError(str(err))
        return standard_b64encode(resp.content).decode()
    def _set_emoji_table(self, item: dict):
--- a/pytchat/tool/extract/asyncdl.py
+++ b/pytchat/tool/extract/asyncdl.py
@@ -1,5 +1,6 @@
 import httpx
 import asyncio
 import httpx
 import socket
 from . import parser
 from . block import Block
 from . worker import ExtractWorker
@@ -8,14 +9,19 @@ from ... import config
 from ... paramgen import arcparam
 from ... exceptions import UnknownConnectionError
 from concurrent.futures import CancelledError
 from httpx import NetworkError, TimeoutException, ConnectError
 from json import JSONDecodeError
 from urllib.parse import quote
 headers = config.headers
 REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
             "get_live_chat_replay?continuation="
 MAX_RETRY_COUNT = 3
 # Set to avoid duplicate parameters
 param_set = set()
 def _split(start, end, count, min_interval_sec=120):
    """
@@ -50,6 +56,7 @@ def _split(start, end, count, min_interval_sec=120):
 def ready_blocks(video_id, duration, div, callback):
    param_set.clear()
    if div <= 0:
        raise ValueError
@@ -62,16 +69,24 @@ def ready_blocks(video_id, duration, div, callback):
    async def _create_block(session, video_id, seektime, callback):
        continuation = arcparam.getparam(video_id, seektime=seektime)
        url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
        err = None
        for _ in range(MAX_RETRY_COUNT):
            try:
-                resp = await session.get(url, headers=headers)
+                if continuation in param_set:
                    next_continuation, actions = None, []
                    break
                param_set.add(continuation)
                resp = await session.get(url, headers=headers, timeout=10)
                next_continuation, actions = parser.parse(resp.json())
                break
            except JSONDecodeError:
                await asyncio.sleep(3)
            except (NetworkError, TimeoutException, ConnectError) as e:
                err = e
                await asyncio.sleep(3)
        else:
            cancel()
-            raise UnknownConnectionError("Abort: Unknown connection error.")
+            raise UnknownConnectionError("Abort:" + str(err))
        if actions:
            first = parser.get_offset(actions[0])
@@ -110,16 +125,27 @@ def fetch_patch(callback, blocks, video_id):
    async def _fetch(continuation, session) -> Patch:
        url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
        err = None
        for _ in range(MAX_RETRY_COUNT):
            try:
                if continuation in param_set:
                    continuation, actions = None, []
                    break
                param_set.add(continuation)
                resp = await session.get(url, headers=config.headers)
                continuation, actions = parser.parse(resp.json())
                break
            except JSONDecodeError:
                await asyncio.sleep(3)
            except (NetworkError, TimeoutException, ConnectError) as e:
                err = e
                await asyncio.sleep(3)
            except socket.error as error:
                print("socket error", error.errno)
                await asyncio.sleep(3)
        else:
            cancel()
-            raise UnknownConnectionError("Abort: Unknown connection error.")
+            raise UnknownConnectionError("Abort:" + str(err))
        if actions:
            last = parser.get_offset(actions[-1])
@@ -140,15 +166,10 @@ def fetch_patch(callback, blocks, video_id):
 async def _shutdown():
    """Cancel every task except the caller and wait for each one to stop.

    Tasks are handled one at a time: a task is cancelled, then awaited, and
    the ``asyncio.CancelledError`` raised by that await is swallowed before
    moving on to the next task.
    """
    print("\nshutdown...")
    me = asyncio.current_task()
    for pending in asyncio.all_tasks():
        if pending is me:
            # Never cancel the coroutine that is performing the shutdown.
            continue
        pending.cancel()
        try:
            await pending
        except asyncio.CancelledError:
            pass
 def cancel():
--- a/pytchat/tool/extract/worker.py
+++ b/pytchat/tool/extract/worker.py
@@ -7,7 +7,6 @@ from typing import Tuple
 class ExtractWorker:
    """
    ExtractWorker associates a download session with a block.
    When the worker finishes fetching, the block
    being fetched is splitted and assigned the free worker.
--- a/pytchat/tool/videoinfo.py
+++ b/pytchat/tool/videoinfo.py
@@ -1,13 +1,16 @@
 import httpx
 import json
 import re
-import httpx
+import time
 from httpx import ConnectError, NetworkError, TimeoutException
 from .. import config
-from ..exceptions import InvalidVideoIdException, PatternUnmatchError
+from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError
 from ..util.extract_video_id import extract_video_id
 headers = config.headers
-pattern = re.compile(r"'PLAYER_CONFIG': ({.*}}})")
+headers = config.headers
 pattern = re.compile(r"['\"]PLAYER_CONFIG['\"]:\s*({.*})")
 item_channel_id = [
    "videoDetails",
@@ -80,19 +83,42 @@ class VideoInfo:
    def __init__(self, video_id):
        self.video_id = extract_video_id(video_id)
-        text = self._get_page_text(self.video_id)
+        err = None
-        self._parse(text)
+        for _ in range(3):
            try:
                text = self._get_page_text(self.video_id)
                self._parse(text)
                break
            except (InvalidVideoIdException, UnknownConnectionError) as e:
                print(str(e))
                raise e
            except Exception as e:
                err = e
                time.sleep(2)
                pass
        else:
            raise err
    def _get_page_text(self, video_id):
        url = f"https://www.youtube.com/embed/{video_id}"
-        resp = httpx.get(url, headers=headers)
+        err = None
-        resp.raise_for_status()
+        for _ in range(3):
            try:
                resp = httpx.get(url, headers=headers)
                resp.raise_for_status()
                break
            except (ConnectError, NetworkError, TimeoutException) as e:
                err = e
                time.sleep(3)
        else:
            raise UnknownConnectionError(str(err))
        return resp.text
    def _parse(self, text):
        result = re.search(pattern, text)
        if result is None:
-            raise PatternUnmatchError(text)
+            raise PatternUnmatchError(doc=text)
        decoder = json.JSONDecoder()
        res = decoder.raw_decode(result.group(1)[:-1])[0]
        response = self._get_item(res, item_response)
--- a/pytchat/util/init.py
+++ b/pytchat/util/init.py
@@ -1,8 +1,12 @@
 import datetime
 import httpx
 import json
-import datetime
+import os
 import re
 from .. import config
 PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
 def extract(url):
    _session = httpx.Client(http2=True)
@@ -16,3 +20,21 @@ def save(data, filename, extention):
    with open(filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention,
              mode='w', encoding='utf-8') as f:
        f.writelines(data)
 def checkpath(filepath):
    """Return a path that does not collide with an existing file.

    When nothing exists at ``filepath`` it is returned unchanged.  Otherwise
    the file name (without extension) gets ``(1)`` appended, or, if it
    already ends in ``(N)`` as matched by ``PATTERN``, that number is bumped
    to ``N + 1``.  This repeats until the resulting path, built in the same
    directory with the same extension, does not exist.
    """
    directory = os.path.dirname(filepath)
    stem, suffix = os.path.splitext(os.path.basename(filepath))
    candidate = filepath
    while os.path.exists(candidate):
        numbered = re.search(PATTERN, stem)
        if numbered is None:
            # First collision for an unnumbered name: start counting at 1.
            stem = f'{stem}(1)'
        else:
            stem = f'{numbered[1]}({int(numbered[2]) + 1})'
        candidate = os.path.join(directory, stem + suffix)
    return candidate
--- a/pytchat/util/extract_video_id.py
+++ b/pytchat/util/extract_video_id.py
@@ -8,6 +8,9 @@ YT_VIDEO_ID_LENGTH = 11
 def extract_video_id(url_or_id: str) -> str:
    ret = ''
    if '[' in url_or_id:
        url_or_id = url_or_id.replace('[', '').replace(']', '')
    if type(url_or_id) != str:
        raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.")
    if len(url_or_id) == YT_VIDEO_ID_LENGTH:
Author	SHA1	Message	Date
taizan-hokouto	39d99ad4af	Merge branch 'hotfix/fix_json'	2020-10-06 01:30:15 +09:00
taizan-hokouto	3675c91240	Increment version	2020-10-06 01:24:31 +09:00
taizan-hokouto	46258f625a	Fix import module	2020-10-06 01:24:04 +09:00
taizan-hokouto	2cc161b589	Increment version	2020-10-06 01:20:25 +09:00
taizan-hokouto	115277e5e1	Fix handling internal error and keyboard interrupt	2020-10-06 01:19:45 +09:00
taizan-hokouto	ebf0e7c181	Fix handling json decode error and pattern unmatch	2020-10-05 21:38:51 +09:00
taizan-hokouto	3106b3e545	Merge branch 'hotfix/filepath'	2020-10-04 11:33:58 +09:00
taizan-hokouto	50816a661d	Increment version	2020-10-04 11:30:07 +09:00
taizan-hokouto	6755bc8bb2	Make sure to pass fixed filepath to processor	2020-10-04 11:29:52 +09:00
taizan-hokouto	26be989b9b	Merge branch 'hotfix/fix'	2020-10-04 10:32:53 +09:00
taizan-hokouto	73ad0a1f44	Increment version	2020-10-04 10:22:34 +09:00
taizan-hokouto	66b185ebf7	Fix constructing filepath	2020-10-04 10:20:14 +09:00
taizan_hokuto	71650c39f7	Merge branch 'hotfix/fix'	2020-10-03 22:42:48 +09:00
taizan_hokuto	488445c73b	Increment version	2020-10-03 22:41:53 +09:00
taizan_hokuto	075e811efe	Delete unnecessary code	2020-10-03 22:41:12 +09:00
taizan_hokuto	58d9bf7fdb	Merge branch 'hotfix/pattern'	2020-10-03 22:35:46 +09:00
taizan_hokuto	b3e6275de7	Increment version	2020-10-03 22:35:22 +09:00
taizan_hokuto	748778f545	Fix pattern matching	2020-10-03 22:04:09 +09:00
taizan-hokuto	e29b3b8377	Merge branch 'hotfix/network'	2020-09-14 00:40:40 +09:00
taizan-hokuto	0859ed5fb1	Increment version	2020-09-14 00:29:21 +09:00
taizan-hokuto	a80d5ba080	Fix handling network error	2020-09-14 00:28:41 +09:00
taizan-hokuto	b7e6043a71	Merge branch 'hotfix/memory'	2020-09-12 02:12:46 +09:00
taizan-hokuto	820ba35013	Increment version	2020-09-12 02:02:07 +09:00
taizan-hokuto	ecd2d130bf	Clear set each time the extraction changes	2020-09-12 01:57:55 +09:00
taizan-hokuto	f77a2c889b	Merge branch 'hotfix/not_quit'	2020-09-12 00:57:48 +09:00
taizan-hokuto	47d5ab288f	Increment version	2020-09-12 00:49:37 +09:00
taizan-hokuto	5f53fd24dd	Format	2020-09-12 00:48:40 +09:00
taizan-hokuto	11a9d0e2d7	Fix a problem with extraction not completing	2020-09-12 00:42:30 +09:00