From 748778f545dd322afa98a266d427890f6857f152 Mon Sep 17 00:00:00 2001 From: taizan_hokuto <55448286+taizan-hokuto@users.noreply.github.com> Date: Sat, 3 Oct 2020 22:04:09 +0900 Subject: [PATCH 1/2] Fix pattern matching --- pytchat/cli/__init__.py | 7 +++++-- pytchat/processors/html_archiver.py | 5 +++-- pytchat/tool/videoinfo.py | 18 +++++++++++++----- pytchat/util/__init__.py | 24 +++++++++++++++++++++++- 4 files changed, 44 insertions(+), 10 deletions(-) diff --git a/pytchat/cli/__init__.py b/pytchat/cli/__init__.py index f999b04..451964f 100644 --- a/pytchat/cli/__init__.py +++ b/pytchat/cli/__init__.py @@ -57,7 +57,10 @@ def main(): try: video_id = extract_video_id(video_id) if os.path.exists(Arguments().output): - path = Path(Arguments().output + video_id + '.html') + if Arguments().output[-1] != "/" or Arguments().output[-1] != "\\": + Arguments().output = '/'.join([Arguments().output, os.path.sep]) + path = util.checkpath(Path.resolve(Path(Arguments().output + video_id + '.html'))) + print(path) else: raise FileNotFoundError err = None @@ -80,7 +83,7 @@ def main(): f" channel: {info.get_channel_name()}\n" f" title: {info.get_title()}") - print(f" output path: {path.resolve()}") + print(f" output path: {path}") duration = info.get_duration() pbar = ProgressBar(total=(duration * 1000), status="Extracting") ex = Extractor(video_id, diff --git a/pytchat/processors/html_archiver.py b/pytchat/processors/html_archiver.py index 9e08c40..95ada14 100644 --- a/pytchat/processors/html_archiver.py +++ b/pytchat/processors/html_archiver.py @@ -116,11 +116,12 @@ class HTMLArchiver(ChatProcessor): def _encode_img(self, url): err = None - for _ in range(3): + for _ in range(5): try: - resp = httpx.get(url) + resp = httpx.get(url, timeout=30) break except (NetworkError, ReadTimeout) as e: + print("Network Error. retrying...") err = e time.sleep(3) else: diff --git a/pytchat/tool/videoinfo.py b/pytchat/tool/videoinfo.py index e1d1d0a..8510a5a 100644 --- a/pytchat/tool/videoinfo.py +++ b/pytchat/tool/videoinfo.py @@ -9,8 +9,8 @@ from ..util.extract_video_id import extract_video_id headers = config.headers - -pattern = re.compile(r"'PLAYER_CONFIG': ({.*}}})") + +pattern = re.compile(r"['\"]PLAYER_CONFIG['\"]:\s*({.*})") item_channel_id = [ "videoDetails", @@ -83,8 +83,16 @@ class VideoInfo: def __init__(self, video_id): self.video_id = extract_video_id(video_id) - text = self._get_page_text(self.video_id) - self._parse(text) + for _ in range(3): + try: + text = self._get_page_text(self.video_id) + self._parse(text) + break + except PatternUnmatchError: + time.sleep(2) + pass + else: + raise PatternUnmatchError("Pattern Unmatch") def _get_page_text(self, video_id): url = f"https://www.youtube.com/embed/{video_id}" @@ -105,7 +113,7 @@ class VideoInfo: def _parse(self, text): result = re.search(pattern, text) if result is None: - raise PatternUnmatchError(text) + raise PatternUnmatchError() decoder = json.JSONDecoder() res = decoder.raw_decode(result.group(1)[:-1])[0] response = self._get_item(res, item_response) diff --git a/pytchat/util/__init__.py b/pytchat/util/__init__.py index fc31dab..d215c5d 100644 --- a/pytchat/util/__init__.py +++ b/pytchat/util/__init__.py @@ -1,8 +1,12 @@ +import datetime import httpx import json -import datetime +import os +import re from .. import config +PATTERN = re.compile(r"(.*)\(([0-9]+)\)$") + def extract(url): _session = httpx.Client(http2=True) @@ -16,3 +20,21 @@ def save(data, filename, extention): with open(filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention, mode='w', encoding='utf-8') as f: f.writelines(data) + + +def checkpath(filepath): + splitter = os.path.splitext(os.path.basename(filepath)) + body = splitter[0] + extention = splitter[1] + newpath = filepath + counter = 1 + while os.path.exists(newpath): + match = re.search(PATTERN, body) + if match: + counter = int(match[2]) + 1 + num_with_bracket = f'({str(counter)})' + body = f'{match[1]}{num_with_bracket}' + else: + body = f'{body}({str(counter)})' + newpath = os.path.join(os.path.dirname(filepath), body + extention) + return newpath From b3e6275de7d3f364a9f805c1109f7a627a194469 Mon Sep 17 00:00:00 2001 From: taizan_hokuto <55448286+taizan-hokuto@users.noreply.github.com> Date: Sat, 3 Oct 2020 22:35:22 +0900 Subject: [PATCH 2/2] Increment version --- pytchat/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytchat/__init__.py b/pytchat/__init__.py index bba3332..661ba67 100644 --- a/pytchat/__init__.py +++ b/pytchat/__init__.py @@ -2,7 +2,7 @@ pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup. """ __copyright__ = 'Copyright (C) 2019 taizan-hokuto' -__version__ = '0.2.5' +__version__ = '0.2.6' __license__ = 'MIT' __author__ = 'taizan-hokuto' __author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'