Compare commits
25 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 3106b3e545 | |
| | 50816a661d | |
| | 6755bc8bb2 | |
| | 26be989b9b | |
| | 73ad0a1f44 | |
| | 66b185ebf7 | |
| | 71650c39f7 | |
| | 488445c73b | |
| | 075e811efe | |
| | 58d9bf7fdb | |
| | b3e6275de7 | |
| | 748778f545 | |
| | e29b3b8377 | |
| | 0859ed5fb1 | |
| | a80d5ba080 | |
| | b7e6043a71 | |
| | 820ba35013 | |
| | ecd2d130bf | |
| | f77a2c889b | |
| | 47d5ab288f | |
| | 5f53fd24dd | |
| | 11a9d0e2d7 | |
| | 480c9e15b8 | |
| | 35aa7636f6 | |
| | 8fee67c2d4 | |
pytchat/__init__.py

```diff
@@ -2,7 +2,7 @@
 pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
 """
 __copyright__ = 'Copyright (C) 2019 taizan-hokuto'
-__version__ = '0.2.1'
+__version__ = '0.3.0'
 __license__ = 'MIT'
 __author__ = 'taizan-hokuto'
 __author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'
```
pytchat/cli/__init__.py

```diff
@@ -2,11 +2,13 @@ import argparse
 
 import os
 import signal
 import time
+from json.decoder import JSONDecodeError
 from pathlib import Path
+from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError
 from .arguments import Arguments
 from .progressbar import ProgressBar
-from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError
+from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError, UnknownConnectionError
 from .. processors.html_archiver import HTMLArchiver
 from .. tool.extract.extractor import Extractor
 from .. tool.videoinfo import VideoInfo
@@ -49,21 +51,36 @@ def main():
     for counter, video_id in enumerate(Arguments().video_ids):
         if '[' in video_id:
             video_id = video_id.replace('[', '').replace(']', '')
-        if len(Arguments().video_ids) > 1:
-            print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
+
         try:
             video_id = extract_video_id(video_id)
-            if os.path.exists(Arguments().output):
-                path = Path(Arguments().output + video_id + '.html')
-            else:
+            if not os.path.exists(Arguments().output):
                 raise FileNotFoundError
-            info = VideoInfo(video_id)
+            if len(Arguments().video_ids) > 1:
+                print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
+            separated_path = str(Path(Arguments().output)) + os.path.sep
+            path = util.checkpath(separated_path + video_id + '.html')
+            err = None
+            for _ in range(3):  # retry 3 times
+                try:
+                    info = VideoInfo(video_id)
+                    break
+                except (PatternUnmatchError, JSONDecodeError, InvalidVideoIdException) as e:
+                    err = e
+                    time.sleep(2)
+                    continue
+            else:
+                print("Cannot parse video information.:{}".format(video_id))
+                if Arguments().save_error_data:
+                    util.save(err.doc, "ERR", ".dat")
+                continue
 
             print(f"\n"
                   f" video_id: {video_id}\n"
                   f" channel: {info.get_channel_name()}\n"
                   f" title: {info.get_title()}")
 
-            print(f" output path: {path.resolve()}")
+            print(f" output path: {path}")
             duration = info.get_duration()
             pbar = ProgressBar(total=(duration * 1000), status="Extracting")
             ex = Extractor(video_id,
```
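The new retry loop above leans on Python's `for`/`else`: the `else` suite runs only when the loop finishes without hitting `break`, i.e. when every attempt failed. A minimal standalone sketch of the idiom; `fetch` and the `ValueError` stand-in are placeholders, not pytchat APIs:

```python
import time


def fetch_with_retry(fetch, attempts=3, wait=2):
    err = None
    for _ in range(attempts):
        try:
            result = fetch()
            break                 # success: the else suite is skipped
        except ValueError as e:   # stand-in for PatternUnmatchError etc.
            err = e
            time.sleep(wait)
    else:
        # reached only if no iteration hit `break`
        raise RuntimeError(f"all {attempts} attempts failed: {err}")
    return result
```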
```diff
@@ -74,7 +91,7 @@ def main():
             if data == []:
                 return False
             pbar.reset("#", "=", total=len(data), status="Rendering ")
-            processor = HTMLArchiver(Arguments().output + video_id + '.html', callback=pbar._disp)
+            processor = HTMLArchiver(path, callback=pbar._disp)
             processor.process(
                 [{'video_id': None,
                   'timeout': 1,
@@ -86,8 +103,6 @@ def main():
             print()
             if pbar.is_cancelled():
                 print("\nThe extraction process has been discontinued.\n")
-
-
         except InvalidVideoIdException:
             print("Invalid Video ID or URL:", video_id)
         except NoContents as e:
@@ -96,14 +111,15 @@ def main():
             print("The specified directory does not exist.:{}".format(Arguments().output))
         except JSONDecodeError as e:
             print(e.msg)
-            print("Cannot parse video information.:{}".format(video_id))
+            print("JSONDecodeError.:{}".format(video_id))
             if Arguments().save_error_data:
                 util.save(e.doc, "ERR_JSON_DECODE", ".dat")
-        except PatternUnmatchError as e:
-            print(e.msg)
-            print("Cannot parse video information.:{}".format(video_id))
-            if Arguments().save_error_data:
-                util.save(e.doc, "ERR_PATTERN_UNMATCH", ".dat")
+        except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
+            print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
+        except PatternUnmatchError:
+            print(f"PatternUnmatchError [{video_id}]. ")
         except Exception as e:
             print(type(e), str(e))
 
     return
```
pytchat/exceptions.py

```diff
@@ -38,7 +38,9 @@ class InvalidVideoIdException(Exception):
     '''
     Thrown when the video_id is not exist (VideoInfo).
     '''
-    pass
+    def __init__(self, doc):
+        self.msg = "InvalidVideoIdException"
+        self.doc = doc
 
 
 class UnknownConnectionError(Exception):
@@ -47,7 +49,7 @@ class UnknownConnectionError(Exception):
 
 class RetryExceedMaxCount(Exception):
     '''
-    thrown when the number of retries exceeds the maximum value.
+    Thrown when the number of retries exceeds the maximum value.
     '''
     pass
@@ -66,13 +68,13 @@ class FailedExtractContinuation(ChatDataFinished):
 
 class VideoInfoParseError(Exception):
     '''
-    thrown when failed to parse video info
+    Base exception when parsing video info.
     '''
 
 
 class PatternUnmatchError(VideoInfoParseError):
     '''
-    thrown when failed to parse video info with unmatched pattern
+    Thrown when failed to parse video info with unmatched pattern.
     '''
     def __init__(self, doc):
         self.msg = "PatternUnmatchError"
```
pytchat/processors/html_archiver.py

```diff
@@ -1,9 +1,12 @@
-import httpx
 import os
 import re
+import httpx
+import time
 from base64 import standard_b64encode
+from httpx import NetworkError, ReadTimeout
 from .chat_processor import ChatProcessor
 from .default.processor import DefaultProcessor
+from ..exceptions import UnknownConnectionError
 
 
 PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
@@ -112,7 +115,18 @@ class HTMLArchiver(ChatProcessor):
                        for item in message_items)
 
     def _encode_img(self, url):
-        resp = httpx.get(url)
+        err = None
+        for _ in range(5):
+            try:
+                resp = httpx.get(url, timeout=30)
+                break
+            except (NetworkError, ReadTimeout) as e:
+                print("Network Error. retrying...")
+                err = e
+                time.sleep(3)
+        else:
+            raise UnknownConnectionError(str(err))
+
         return standard_b64encode(resp.content).decode()
 
     def _set_emoji_table(self, item: dict):
```
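`_encode_img` returns the fetched image as a base64 string, presumably so the archiver can inline it into the saved HTML (for example as a `data:` URI) and keep the page self-contained. A hedged illustration of that encoding step, with made-up bytes standing in for `resp.content`:

```python
from base64 import standard_b64encode

img_bytes = b"\x89PNG\r\n\x1a\n..."  # stand-in for resp.content
b64 = standard_b64encode(img_bytes).decode()
# hypothetical inline embedding of the fetched image:
tag = f'<img src="data:image/png;base64,{b64}">'
```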
pytchat/tool/extract/asyncdl.py

```diff
@@ -8,14 +8,19 @@ from ... import config
 from ... paramgen import arcparam
+from ... exceptions import UnknownConnectionError
 from concurrent.futures import CancelledError
+from httpx import NetworkError, ReadTimeout
+from json import JSONDecodeError
 from urllib.parse import quote
 
 
 headers = config.headers
 REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
              "get_live_chat_replay?continuation="
+MAX_RETRY_COUNT = 3
+
+# Set to avoid duplicate parameters
+param_set = set()
 
 
 def _split(start, end, count, min_interval_sec=120):
     """
@@ -50,6 +55,7 @@ def _split(start, end, count, min_interval_sec=120):
 
 
 def ready_blocks(video_id, duration, div, callback):
+    param_set.clear()
     if div <= 0:
         raise ValueError
 
@@ -62,16 +68,24 @@ def ready_blocks(video_id, duration, div, callback):
     async def _create_block(session, video_id, seektime, callback):
         continuation = arcparam.getparam(video_id, seektime=seektime)
         url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
+        err = None
         for _ in range(MAX_RETRY_COUNT):
             try:
+                if continuation in param_set:
+                    next_continuation, actions = None, []
+                    break
+                param_set.add(continuation)
                 resp = await session.get(url, headers=headers)
                 next_continuation, actions = parser.parse(resp.json())
                 break
             except JSONDecodeError:
                 await asyncio.sleep(3)
+            except (NetworkError, ReadTimeout) as e:
+                err = e
+                await asyncio.sleep(3)
         else:
             cancel()
-            raise UnknownConnectionError("Abort: Unknown connection error.")
+            raise UnknownConnectionError("Abort:" + str(err))
 
         if actions:
             first = parser.get_offset(actions[0])
@@ -110,16 +124,24 @@ def fetch_patch(callback, blocks, video_id):
 
     async def _fetch(continuation, session) -> Patch:
         url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
+        err = None
         for _ in range(MAX_RETRY_COUNT):
             try:
+                if continuation in param_set:
+                    continuation, actions = None, []
+                    break
+                param_set.add(continuation)
                 resp = await session.get(url, headers=config.headers)
                 continuation, actions = parser.parse(resp.json())
                 break
             except JSONDecodeError:
                 await asyncio.sleep(3)
+            except (NetworkError, ReadTimeout) as e:
+                err = e
+                await asyncio.sleep(3)
         else:
             cancel()
-            raise UnknownConnectionError("Abort: Unknown connection error.")
+            raise UnknownConnectionError("Abort:" + str(err))
 
         if actions:
             last = parser.get_offset(actions[-1])
```
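`param_set` is module-level state shared by `_create_block` and `_fetch`, which is why `ready_blocks` clears it at the start of each run: a continuation seen twice short-circuits to an empty result instead of being fetched again. A simplified synchronous sketch of that guard, where `fetch` is a placeholder for the session call:

```python
param_set = set()


def fetch_once(continuation, fetch):
    if continuation in param_set:
        return None, []             # already requested: skip, don't refetch
    param_set.add(continuation)
    return fetch(continuation)      # -> (next_continuation, actions)
```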
pytchat/tool/extract/extractor.py

```diff
@@ -93,4 +93,5 @@ class Extractor:
         return ret
 
     def cancel(self):
+        print("cancel")
         asyncdl.cancel()
```
pytchat/tool/extract/worker.py

```diff
@@ -7,7 +7,6 @@ from typing import Tuple
 class ExtractWorker:
     """
     ExtractWorker associates a download session with a block.
-
     When the worker finishes fetching, the block
     being fetched is splitted and assigned the free worker.
 
```
pytchat/tool/videoinfo.py

```diff
@@ -1,13 +1,16 @@
-import httpx
 import json
 import re
+import httpx
+import time
+from httpx import ConnectError, NetworkError
 from .. import config
-from ..exceptions import InvalidVideoIdException, PatternUnmatchError
+from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError
 from ..util.extract_video_id import extract_video_id
 
 
 headers = config.headers
 
-pattern = re.compile(r"'PLAYER_CONFIG': ({.*}}})")
+pattern = re.compile(r"['\"]PLAYER_CONFIG['\"]:\s*({.*})")
 
 item_channel_id = [
     "videoDetails",
@@ -80,19 +83,37 @@ class VideoInfo:
 
     def __init__(self, video_id):
         self.video_id = extract_video_id(video_id)
-        text = self._get_page_text(self.video_id)
-        self._parse(text)
+        for _ in range(3):
+            try:
+                text = self._get_page_text(self.video_id)
+                self._parse(text)
+                break
+            except PatternUnmatchError:
+                time.sleep(2)
+                pass
+        else:
+            raise PatternUnmatchError("Pattern Unmatch")
 
     def _get_page_text(self, video_id):
         url = f"https://www.youtube.com/embed/{video_id}"
-        resp = httpx.get(url, headers=headers)
-        resp.raise_for_status()
+        err = None
+        for _ in range(3):
+            try:
+                resp = httpx.get(url, headers=headers)
+                resp.raise_for_status()
+                break
+            except (ConnectError, NetworkError) as e:
+                err = e
+                time.sleep(3)
+        else:
+            raise UnknownConnectionError(str(err))
+
         return resp.text
 
     def _parse(self, text):
         result = re.search(pattern, text)
         if result is None:
-            raise PatternUnmatchError(text)
+            raise PatternUnmatchError()
         decoder = json.JSONDecoder()
         res = decoder.raw_decode(result.group(1)[:-1])[0]
         response = self._get_item(res, item_response)
```
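The loosened `PLAYER_CONFIG` regex captures greedily past the end of the JSON object, which is safe because `json.JSONDecoder.raw_decode` parses the first complete JSON value and reports where it stopped, ignoring any trailing page text. A standalone demonstration with an invented snippet:

```python
import json

decoder = json.JSONDecoder()
text = '{"args": {"a": 1}}; rest of the page'
obj, end = decoder.raw_decode(text)
print(obj)         # {'args': {'a': 1}}
print(text[end:])  # '; rest of the page'
```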
pytchat/util/__init__.py

```diff
@@ -1,8 +1,12 @@
-import datetime
 import httpx
 import json
+import datetime
+import os
+import re
 from .. import config
 
+PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
+
 
 def extract(url):
     _session = httpx.Client(http2=True)
@@ -16,3 +20,21 @@ def save(data, filename, extention):
     with open(filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention,
               mode='w', encoding='utf-8') as f:
         f.writelines(data)
+
+
+def checkpath(filepath):
+    splitter = os.path.splitext(os.path.basename(filepath))
+    body = splitter[0]
+    extention = splitter[1]
+    newpath = filepath
+    counter = 1
+    while os.path.exists(newpath):
+        match = re.search(PATTERN, body)
+        if match:
+            counter = int(match[2]) + 1
+            num_with_bracket = f'({str(counter)})'
+            body = f'{match[1]}{num_with_bracket}'
+        else:
+            body = f'{body}({str(counter)})'
+        newpath = os.path.join(os.path.dirname(filepath), body + extention)
+    return newpath
```
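`checkpath` gives the archiver collision-free output names by appending or incrementing a `(n)` suffix; `PATTERN` recognizes an existing suffix, so repeated collisions count up instead of nesting brackets. A small demo of the expected behavior, with paths invented for the example:

```python
import os
import tempfile

from pytchat import util

with tempfile.TemporaryDirectory() as d:
    for name in ("video.html", "video(1).html"):
        open(os.path.join(d, name), "w").close()
    # both names are taken, so the next free one is video(2).html
    print(util.checkpath(os.path.join(d, "video.html")))
```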