From ebf0e7c1813f60a01a86f0cd18aff30eabe406c2 Mon Sep 17 00:00:00 2001 From: taizan-hokouto <55448286+taizan-hokuto@users.noreply.github.com> Date: Mon, 5 Oct 2020 21:38:51 +0900 Subject: [PATCH] Fix handling json decode error and pattern unmatch --- pytchat/cli/__init__.py | 43 ++++++++++++-------------------- pytchat/tool/videoinfo.py | 15 +++++++---- pytchat/util/extract_video_id.py | 3 +++ 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/pytchat/cli/__init__.py b/pytchat/cli/__init__.py index 97a6442..b88f95a 100644 --- a/pytchat/cli/__init__.py +++ b/pytchat/cli/__init__.py @@ -2,7 +2,6 @@ import argparse import os import signal -import time from json.decoder import JSONDecodeError from pathlib import Path from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError @@ -38,6 +37,7 @@ def main(): help='Save error data when error occurs(".dat" file)') parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true', help='Show version') + Arguments(parser.parse_args().__dict__) if Arguments().print_version: @@ -48,39 +48,33 @@ def main(): if not Arguments().video_ids: parser.print_help() return + + if not os.path.exists(Arguments().output): + print("\nThe specified directory does not exist.:{}\n".format(Arguments().output)) + return + for counter, video_id in enumerate(Arguments().video_ids): - if '[' in video_id: - video_id = video_id.replace('[', '').replace(']', '') if len(Arguments().video_ids) > 1: print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}") try: video_id = extract_video_id(video_id) - if not os.path.exists(Arguments().output): - raise FileNotFoundError separated_path = str(Path(Arguments().output)) + os.path.sep path = util.checkpath(separated_path + video_id + '.html') - err = None - for _ in range(3): # retry 3 times - try: - info = VideoInfo(video_id) - break - except (PatternUnmatchError, JSONDecodeError, InvalidVideoIdException) as e: - err = e - time.sleep(2) - continue - else: - print("Cannot parse video information.:{}".format(video_id)) + try: + info = VideoInfo(video_id) + except Exception as e: + print("Cannot parse video information.:{} {}".format(video_id, type(e))) if Arguments().save_error_data: - util.save(err.doc, "ERR", ".dat") + util.save(str(e), "ERR", ".dat") continue print(f"\n" f" video_id: {video_id}\n" f" channel: {info.get_channel_name()}\n" - f" title: {info.get_title()}") + f" title: {info.get_title()}\n" + f" output path: {path}") - print(f" output path: {path}") duration = info.get_duration() pbar = ProgressBar(total=(duration * 1000), status="Extracting") ex = Extractor(video_id, @@ -107,17 +101,12 @@ def main(): print("Invalid Video ID or URL:", video_id) except NoContents as e: print(e) - except FileNotFoundError: - print("The specified directory does not exist.:{}".format(Arguments().output)) - except JSONDecodeError as e: - print(e.msg) - print("JSONDecodeError.:{}".format(video_id)) + except (JSONDecodeError, PatternUnmatchError) as e: + print("{}:{}".format(e.msg, video_id)) if Arguments().save_error_data: - util.save(e.doc, "ERR_JSON_DECODE", ".dat") + util.save(e.doc, "ERR_", ".dat") except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e: print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e)) - except PatternUnmatchError: - print(f"PatternUnmatchError [{video_id}]. ") except Exception as e: print(type(e), str(e)) diff --git a/pytchat/tool/videoinfo.py b/pytchat/tool/videoinfo.py index 8510a5a..722a619 100644 --- a/pytchat/tool/videoinfo.py +++ b/pytchat/tool/videoinfo.py @@ -2,7 +2,7 @@ import httpx import json import re import time -from httpx import ConnectError, NetworkError +from httpx import ConnectError, NetworkError, TimeoutException from .. import config from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError from ..util.extract_video_id import extract_video_id @@ -83,16 +83,21 @@ class VideoInfo: def __init__(self, video_id): self.video_id = extract_video_id(video_id) + err = None for _ in range(3): try: text = self._get_page_text(self.video_id) self._parse(text) break - except PatternUnmatchError: + except (InvalidVideoIdException, UnknownConnectionError) as e: + print(str(e)) + raise e + except Exception as e: + err = e time.sleep(2) pass else: - raise PatternUnmatchError("Pattern Unmatch") + raise err def _get_page_text(self, video_id): url = f"https://www.youtube.com/embed/{video_id}" @@ -102,7 +107,7 @@ class VideoInfo: resp = httpx.get(url, headers=headers) resp.raise_for_status() break - except (ConnectError, NetworkError) as e: + except (ConnectError, NetworkError, TimeoutException) as e: err = e time.sleep(3) else: @@ -113,7 +118,7 @@ class VideoInfo: def _parse(self, text): result = re.search(pattern, text) if result is None: - raise PatternUnmatchError() + raise PatternUnmatchError(doc=text) decoder = json.JSONDecoder() res = decoder.raw_decode(result.group(1)[:-1])[0] response = self._get_item(res, item_response) diff --git a/pytchat/util/extract_video_id.py b/pytchat/util/extract_video_id.py index 75385f8..c62cd89 100644 --- a/pytchat/util/extract_video_id.py +++ b/pytchat/util/extract_video_id.py @@ -8,6 +8,9 @@ YT_VIDEO_ID_LENGTH = 11 def extract_video_id(url_or_id: str) -> str: ret = '' + if '[' in url_or_id: + url_or_id = url_or_id.replace('[', '').replace(']', '') + if type(url_or_id) != str: raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.") if len(url_or_id) == YT_VIDEO_ID_LENGTH: