Compare commits

...

12 Commits

Author SHA1 Message Date
taizan-hokouto
39d99ad4af Merge branch 'hotfix/fix_json' 2020-10-06 01:30:15 +09:00
taizan-hokouto
3675c91240 Increment version 2020-10-06 01:24:31 +09:00
taizan-hokouto
46258f625a Fix import module 2020-10-06 01:24:04 +09:00
taizan-hokouto
2cc161b589 Increment version 2020-10-06 01:20:25 +09:00
taizan-hokouto
115277e5e1 Fix handling internal error and keyboard interrupt 2020-10-06 01:19:45 +09:00
taizan-hokouto
ebf0e7c181 Fix handling json decode error and pattern unmatch 2020-10-05 21:38:51 +09:00
taizan-hokouto
3106b3e545 Merge branch 'hotfix/filepath' 2020-10-04 11:33:58 +09:00
taizan-hokouto
50816a661d Increment version 2020-10-04 11:30:07 +09:00
taizan-hokouto
6755bc8bb2 Make sure to pass fixed filepath to processor 2020-10-04 11:29:52 +09:00
taizan-hokouto
26be989b9b Merge branch 'hotfix/fix' 2020-10-04 10:32:53 +09:00
taizan-hokouto
73ad0a1f44 Increment version 2020-10-04 10:22:34 +09:00
taizan-hokouto
66b185ebf7 Fix constructing filepath 2020-10-04 10:20:14 +09:00
7 changed files with 130 additions and 100 deletions

View File

@@ -2,7 +2,7 @@
pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup. pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
""" """
__copyright__ = 'Copyright (C) 2019 taizan-hokuto' __copyright__ = 'Copyright (C) 2019 taizan-hokuto'
__version__ = '0.2.7' __version__ = '0.3.2'
__license__ = 'MIT' __license__ = 'MIT'
__author__ = 'taizan-hokuto' __author__ = 'taizan-hokuto'
__author_email__ = '55448286+taizan-hokuto@users.noreply.github.com' __author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'

View File

@@ -1,8 +1,11 @@
import argparse import argparse
import asyncio
try:
from asyncio import CancelledError
except ImportError:
from asyncio.futures import CancelledError
import os import os
import signal import signal
import time
from json.decoder import JSONDecodeError from json.decoder import JSONDecodeError
from pathlib import Path from pathlib import Path
from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError
@@ -38,6 +41,7 @@ def main():
help='Save error data when error occurs(".dat" file)') help='Save error data when error occurs(".dat" file)')
parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true', parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true',
help='Show version') help='Show version')
Arguments(parser.parse_args().__dict__) Arguments(parser.parse_args().__dict__)
if Arguments().print_version: if Arguments().print_version:
@@ -48,84 +52,106 @@ def main():
if not Arguments().video_ids: if not Arguments().video_ids:
parser.print_help() parser.print_help()
return return
for counter, video_id in enumerate(Arguments().video_ids):
if '[' in video_id:
video_id = video_id.replace('[', '').replace(']', '')
if len(Arguments().video_ids) > 1:
print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
try: if not os.path.exists(Arguments().output):
video_id = extract_video_id(video_id) print("\nThe specified directory does not exist.:{}\n".format(Arguments().output))
if os.path.exists(Arguments().output): return
if Arguments().output[-1] != "/" or Arguments().output[-1] != "\\": try:
Arguments().output = '/'.join([Arguments().output, os.path.sep]) Runner().run()
path = util.checkpath(Path.resolve(Path(Arguments().output + video_id + '.html'))) except CancelledError as e:
else: print(str(e))
raise FileNotFoundError
err = None
for _ in range(3): # retry 3 times class Runner:
def run(self) -> None:
ex = None
pbar = None
for counter, video_id in enumerate(Arguments().video_ids):
if len(Arguments().video_ids) > 1:
print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
try:
video_id = extract_video_id(video_id)
separated_path = str(Path(Arguments().output)) + os.path.sep
path = util.checkpath(separated_path + video_id + '.html')
try: try:
info = VideoInfo(video_id) info = VideoInfo(video_id)
break except Exception as e:
except (PatternUnmatchError, JSONDecodeError, InvalidVideoIdException) as e: print("Cannot parse video information.:{} {}".format(video_id, type(e)))
err = e if Arguments().save_error_data:
time.sleep(2) util.save(str(e), "ERR", ".dat")
continue continue
else:
print("Cannot parse video information.:{}".format(video_id)) print(f"\n"
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
f" title: {info.get_title()}\n"
f" output path: {path}")
duration = info.get_duration()
pbar = ProgressBar(total=(duration * 1000), status_txt="Extracting")
ex = Extractor(video_id,
callback=pbar.disp,
div=10)
signal.signal(signal.SIGINT, (lambda a, b: self.cancel(ex, pbar)))
data = ex.extract()
if data == []:
continue
pbar.reset("#", "=", total=len(data), status_txt="Rendering ")
processor = HTMLArchiver(path, callback=pbar.disp)
processor.process(
[{'video_id': None,
'timeout': 1,
'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
)
processor.finalize()
pbar.reset('#', '#', status_txt='Completed ')
pbar.close()
print()
if pbar.is_cancelled():
print("\nThe extraction process has been discontinued.\n")
except InvalidVideoIdException:
print("Invalid Video ID or URL:", video_id)
except NoContents as e:
print(f"Abort:{str(e)}:[{video_id}]")
except (JSONDecodeError, PatternUnmatchError) as e:
print("{}:{}".format(e.msg, video_id))
if Arguments().save_error_data: if Arguments().save_error_data:
util.save(err.doc, "ERR", ".dat") util.save(e.doc, "ERR_", ".dat")
continue except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
except Exception as e:
print(f"Abort:{str(type(e))} {str(e)[:80]}")
finally:
clear_tasks()
print(f"\n" return
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
f" title: {info.get_title()}")
print(f" output path: {path}") def cancel(self, ex=None, pbar=None) -> None:
duration = info.get_duration() '''Called when a keyboard interrupt has occurred.
pbar = ProgressBar(total=(duration * 1000), status="Extracting") '''
ex = Extractor(video_id, print("\nKeyboard interrupted.\n")
callback=pbar._disp, if ex and pbar:
div=10) ex.cancel()
signal.signal(signal.SIGINT, (lambda a, b: cancel(ex, pbar))) pbar.cancel()
data = ex.extract()
if data == []:
return False
pbar.reset("#", "=", total=len(data), status="Rendering ")
processor = HTMLArchiver(Arguments().output + video_id + '.html', callback=pbar._disp)
processor.process(
[{'video_id': None,
'timeout': 1,
'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
)
processor.finalize()
pbar.reset('#', '#', status='Completed ')
pbar.close()
print()
if pbar.is_cancelled():
print("\nThe extraction process has been discontinued.\n")
except InvalidVideoIdException:
print("Invalid Video ID or URL:", video_id)
except NoContents as e:
print(e)
except FileNotFoundError:
print("The specified directory does not exist.:{}".format(Arguments().output))
except JSONDecodeError as e:
print(e.msg)
print("JSONDecodeError.:{}".format(video_id))
if Arguments().save_error_data:
util.save(e.doc, "ERR_JSON_DECODE", ".dat")
except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
except PatternUnmatchError:
print(f"PatternUnmatchError [{video_id}]. ")
except Exception as e:
print(type(e), str(e))
return
def cancel(ex, pbar): def clear_tasks():
ex.cancel() '''
pbar.cancel() Clear remaining tasks.
Called when an internal exception has occurred or
after each extraction process is completed.
'''
async def _shutdown():
tasks = [t for t in asyncio.all_tasks()
if t is not asyncio.current_task()]
for task in tasks:
task.cancel()
try:
loop = asyncio.get_event_loop()
loop.run_until_complete(_shutdown())
except Exception as e:
print(e)

View File

@@ -9,21 +9,20 @@ import sys
class ProgressBar: class ProgressBar:
def __init__(self, total, status): def __init__(self, total, status_txt):
self._bar_len = 60 self._bar_len = 60
self._cancelled = False self._cancelled = False
self.reset(total=total, status=status) self.reset(total=total, status_txt=status_txt)
self._blinker = 0
def reset(self, symbol_done="=", symbol_space=" ", total=100, status=''): def reset(self, symbol_done="=", symbol_space=" ", total=100, status_txt=''):
self.con_width = shutil.get_terminal_size(fallback=(80, 24)).columns self._console_width = shutil.get_terminal_size(fallback=(80, 24)).columns
self._symbol_done = symbol_done self._symbol_done = symbol_done
self._symbol_space = symbol_space self._symbol_space = symbol_space
self._total = total self._total = total
self._status = status self._status_txt = status_txt
self._count = 0 self._count = 0
def _disp(self, _, fetched): def disp(self, _, fetched):
self._progress(fetched, self._total) self._progress(fetched, self._total)
def _progress(self, fillin, total): def _progress(self, fillin, total):
@@ -39,11 +38,10 @@ class ProgressBar:
bar = self._symbol_done * filled_len + \ bar = self._symbol_done * filled_len + \
self._symbol_space * (self._bar_len - filled_len) self._symbol_space * (self._bar_len - filled_len)
disp = f" [{bar}] {percents:>5.1f}% ...{self._status} "[:self.con_width - 1] + '\r' disp = f" [{bar}] {percents:>5.1f}% ...{self._status_txt} "[:self._console_width - 1] + '\r'
sys.stdout.write(disp) sys.stdout.write(disp)
sys.stdout.flush() sys.stdout.flush()
self._blinker += 1
def close(self): def close(self):
if not self._cancelled: if not self._cancelled:

View File

@@ -1,5 +1,6 @@
import httpx
import asyncio import asyncio
import httpx
import socket
from . import parser from . import parser
from . block import Block from . block import Block
from . worker import ExtractWorker from . worker import ExtractWorker
@@ -8,7 +9,7 @@ from ... import config
from ... paramgen import arcparam from ... paramgen import arcparam
from ... exceptions import UnknownConnectionError from ... exceptions import UnknownConnectionError
from concurrent.futures import CancelledError from concurrent.futures import CancelledError
from httpx import NetworkError, ReadTimeout from httpx import NetworkError, TimeoutException, ConnectError
from json import JSONDecodeError from json import JSONDecodeError
from urllib.parse import quote from urllib.parse import quote
@@ -75,12 +76,12 @@ def ready_blocks(video_id, duration, div, callback):
next_continuation, actions = None, [] next_continuation, actions = None, []
break break
param_set.add(continuation) param_set.add(continuation)
resp = await session.get(url, headers=headers) resp = await session.get(url, headers=headers, timeout=10)
next_continuation, actions = parser.parse(resp.json()) next_continuation, actions = parser.parse(resp.json())
break break
except JSONDecodeError: except JSONDecodeError:
await asyncio.sleep(3) await asyncio.sleep(3)
except (NetworkError, ReadTimeout) as e: except (NetworkError, TimeoutException, ConnectError) as e:
err = e err = e
await asyncio.sleep(3) await asyncio.sleep(3)
else: else:
@@ -136,9 +137,12 @@ def fetch_patch(callback, blocks, video_id):
break break
except JSONDecodeError: except JSONDecodeError:
await asyncio.sleep(3) await asyncio.sleep(3)
except (NetworkError, ReadTimeout) as e: except (NetworkError, TimeoutException, ConnectError) as e:
err = e err = e
await asyncio.sleep(3) await asyncio.sleep(3)
except socket.error as error:
print("socket error", error.errno)
await asyncio.sleep(3)
else: else:
cancel() cancel()
raise UnknownConnectionError("Abort:" + str(err)) raise UnknownConnectionError("Abort:" + str(err))
@@ -162,15 +166,10 @@ def fetch_patch(callback, blocks, video_id):
async def _shutdown(): async def _shutdown():
print("\nshutdown...")
tasks = [t for t in asyncio.all_tasks() tasks = [t for t in asyncio.all_tasks()
if t is not asyncio.current_task()] if t is not asyncio.current_task()]
for task in tasks: for task in tasks:
task.cancel() task.cancel()
try:
await task
except asyncio.CancelledError:
pass
def cancel(): def cancel():

View File

@@ -93,5 +93,4 @@ class Extractor:
return ret return ret
def cancel(self): def cancel(self):
print("cancel")
asyncdl.cancel() asyncdl.cancel()

View File

@@ -2,7 +2,7 @@ import httpx
import json import json
import re import re
import time import time
from httpx import ConnectError, NetworkError from httpx import ConnectError, NetworkError, TimeoutException
from .. import config from .. import config
from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError
from ..util.extract_video_id import extract_video_id from ..util.extract_video_id import extract_video_id
@@ -83,16 +83,21 @@ class VideoInfo:
def __init__(self, video_id): def __init__(self, video_id):
self.video_id = extract_video_id(video_id) self.video_id = extract_video_id(video_id)
err = None
for _ in range(3): for _ in range(3):
try: try:
text = self._get_page_text(self.video_id) text = self._get_page_text(self.video_id)
self._parse(text) self._parse(text)
break break
except PatternUnmatchError: except (InvalidVideoIdException, UnknownConnectionError) as e:
print(str(e))
raise e
except Exception as e:
err = e
time.sleep(2) time.sleep(2)
pass pass
else: else:
raise PatternUnmatchError("Pattern Unmatch") raise err
def _get_page_text(self, video_id): def _get_page_text(self, video_id):
url = f"https://www.youtube.com/embed/{video_id}" url = f"https://www.youtube.com/embed/{video_id}"
@@ -102,7 +107,7 @@ class VideoInfo:
resp = httpx.get(url, headers=headers) resp = httpx.get(url, headers=headers)
resp.raise_for_status() resp.raise_for_status()
break break
except (ConnectError, NetworkError) as e: except (ConnectError, NetworkError, TimeoutException) as e:
err = e err = e
time.sleep(3) time.sleep(3)
else: else:
@@ -113,7 +118,7 @@ class VideoInfo:
def _parse(self, text): def _parse(self, text):
result = re.search(pattern, text) result = re.search(pattern, text)
if result is None: if result is None:
raise PatternUnmatchError() raise PatternUnmatchError(doc=text)
decoder = json.JSONDecoder() decoder = json.JSONDecoder()
res = decoder.raw_decode(result.group(1)[:-1])[0] res = decoder.raw_decode(result.group(1)[:-1])[0]
response = self._get_item(res, item_response) response = self._get_item(res, item_response)

View File

@@ -8,6 +8,9 @@ YT_VIDEO_ID_LENGTH = 11
def extract_video_id(url_or_id: str) -> str: def extract_video_id(url_or_id: str) -> str:
ret = '' ret = ''
if '[' in url_or_id:
url_or_id = url_or_id.replace('[', '').replace(']', '')
if type(url_or_id) != str: if type(url_or_id) != str:
raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.") raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.")
if len(url_or_id) == YT_VIDEO_ID_LENGTH: if len(url_or_id) == YT_VIDEO_ID_LENGTH: