Merge tag 'fix_json' into develop

v0.3.2
This commit is contained in:
taizan-hokouto
2020-10-06 01:30:16 +09:00
7 changed files with 130 additions and 98 deletions

View File

@@ -2,7 +2,7 @@
pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
"""
__copyright__ = 'Copyright (C) 2019 taizan-hokuto'
__version__ = '0.3.0'
__version__ = '0.3.2'
__license__ = 'MIT'
__author__ = 'taizan-hokuto'
__author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'

View File

@@ -1,8 +1,11 @@
import argparse
import asyncio
try:
from asyncio import CancelledError
except ImportError:
from asyncio.futures import CancelledError
import os
import signal
import time
from json.decoder import JSONDecodeError
from pathlib import Path
from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError
@@ -38,6 +41,7 @@ def main():
help='Save error data when error occurs(".dat" file)')
parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true',
help='Show version')
Arguments(parser.parse_args().__dict__)
if Arguments().print_version:
@@ -48,82 +52,106 @@ def main():
if not Arguments().video_ids:
parser.print_help()
return
for counter, video_id in enumerate(Arguments().video_ids):
if '[' in video_id:
video_id = video_id.replace('[', '').replace(']', '')
if len(Arguments().video_ids) > 1:
print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
try:
video_id = extract_video_id(video_id)
if not os.path.exists(Arguments().output):
raise FileNotFoundError
separated_path = str(Path(Arguments().output)) + os.path.sep
path = util.checkpath(separated_path + video_id + '.html')
err = None
for _ in range(3): # retry 3 times
if not os.path.exists(Arguments().output):
print("\nThe specified directory does not exist.:{}\n".format(Arguments().output))
return
try:
Runner().run()
except CancelledError as e:
print(str(e))
class Runner:
def run(self) -> None:
ex = None
pbar = None
for counter, video_id in enumerate(Arguments().video_ids):
if len(Arguments().video_ids) > 1:
print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
try:
video_id = extract_video_id(video_id)
separated_path = str(Path(Arguments().output)) + os.path.sep
path = util.checkpath(separated_path + video_id + '.html')
try:
info = VideoInfo(video_id)
break
except (PatternUnmatchError, JSONDecodeError, InvalidVideoIdException) as e:
err = e
time.sleep(2)
except Exception as e:
print("Cannot parse video information.:{} {}".format(video_id, type(e)))
if Arguments().save_error_data:
util.save(str(e), "ERR", ".dat")
continue
else:
print("Cannot parse video information.:{}".format(video_id))
print(f"\n"
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
f" title: {info.get_title()}\n"
f" output path: {path}")
duration = info.get_duration()
pbar = ProgressBar(total=(duration * 1000), status_txt="Extracting")
ex = Extractor(video_id,
callback=pbar.disp,
div=10)
signal.signal(signal.SIGINT, (lambda a, b: self.cancel(ex, pbar)))
data = ex.extract()
if data == []:
continue
pbar.reset("#", "=", total=len(data), status_txt="Rendering ")
processor = HTMLArchiver(path, callback=pbar.disp)
processor.process(
[{'video_id': None,
'timeout': 1,
'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
)
processor.finalize()
pbar.reset('#', '#', status_txt='Completed ')
pbar.close()
print()
if pbar.is_cancelled():
print("\nThe extraction process has been discontinued.\n")
except InvalidVideoIdException:
print("Invalid Video ID or URL:", video_id)
except NoContents as e:
print(f"Abort:{str(e)}:[{video_id}]")
except (JSONDecodeError, PatternUnmatchError) as e:
print("{}:{}".format(e.msg, video_id))
if Arguments().save_error_data:
util.save(err.doc, "ERR", ".dat")
continue
util.save(e.doc, "ERR_", ".dat")
except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
except Exception as e:
print(f"Abort:{str(type(e))} {str(e)[:80]}")
finally:
clear_tasks()
print(f"\n"
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
f" title: {info.get_title()}")
return
print(f" output path: {path}")
duration = info.get_duration()
pbar = ProgressBar(total=(duration * 1000), status="Extracting")
ex = Extractor(video_id,
callback=pbar._disp,
div=10)
signal.signal(signal.SIGINT, (lambda a, b: cancel(ex, pbar)))
data = ex.extract()
if data == []:
return False
pbar.reset("#", "=", total=len(data), status="Rendering ")
processor = HTMLArchiver(path, callback=pbar._disp)
processor.process(
[{'video_id': None,
'timeout': 1,
'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
)
processor.finalize()
pbar.reset('#', '#', status='Completed ')
pbar.close()
print()
if pbar.is_cancelled():
print("\nThe extraction process has been discontinued.\n")
except InvalidVideoIdException:
print("Invalid Video ID or URL:", video_id)
except NoContents as e:
print(e)
except FileNotFoundError:
print("The specified directory does not exist.:{}".format(Arguments().output))
except JSONDecodeError as e:
print(e.msg)
print("JSONDecodeError.:{}".format(video_id))
if Arguments().save_error_data:
util.save(e.doc, "ERR_JSON_DECODE", ".dat")
except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
except PatternUnmatchError:
print(f"PatternUnmatchError [{video_id}]. ")
except Exception as e:
print(type(e), str(e))
return
def cancel(self, ex=None, pbar=None) -> None:
'''Called when keyboard interrupted has occurred.
'''
print("\nKeyboard interrupted.\n")
if ex and pbar:
ex.cancel()
pbar.cancel()
def cancel(ex, pbar):
ex.cancel()
pbar.cancel()
def clear_tasks():
'''
Clear remained tasks.
Called when internal exception has occurred or
after each extraction process is completed.
'''
async def _shutdown():
tasks = [t for t in asyncio.all_tasks()
if t is not asyncio.current_task()]
for task in tasks:
task.cancel()
try:
loop = asyncio.get_event_loop()
loop.run_until_complete(_shutdown())
except Exception as e:
print(e)

View File

@@ -9,21 +9,20 @@ import sys
class ProgressBar:
def __init__(self, total, status):
def __init__(self, total, status_txt):
self._bar_len = 60
self._cancelled = False
self.reset(total=total, status=status)
self._blinker = 0
self.reset(total=total, status_txt=status_txt)
def reset(self, symbol_done="=", symbol_space=" ", total=100, status=''):
self.con_width = shutil.get_terminal_size(fallback=(80, 24)).columns
def reset(self, symbol_done="=", symbol_space=" ", total=100, status_txt=''):
self._console_width = shutil.get_terminal_size(fallback=(80, 24)).columns
self._symbol_done = symbol_done
self._symbol_space = symbol_space
self._total = total
self._status = status
self._status_txt = status_txt
self._count = 0
def _disp(self, _, fetched):
def disp(self, _, fetched):
self._progress(fetched, self._total)
def _progress(self, fillin, total):
@@ -39,11 +38,10 @@ class ProgressBar:
bar = self._symbol_done * filled_len + \
self._symbol_space * (self._bar_len - filled_len)
disp = f" [{bar}] {percents:>5.1f}% ...{self._status} "[:self.con_width - 1] + '\r'
disp = f" [{bar}] {percents:>5.1f}% ...{self._status_txt} "[:self._console_width - 1] + '\r'
sys.stdout.write(disp)
sys.stdout.flush()
self._blinker += 1
def close(self):
if not self._cancelled:

View File

@@ -1,5 +1,6 @@
import httpx
import asyncio
import httpx
import socket
from . import parser
from . block import Block
from . worker import ExtractWorker
@@ -8,7 +9,7 @@ from ... import config
from ... paramgen import arcparam
from ... exceptions import UnknownConnectionError
from concurrent.futures import CancelledError
from httpx import NetworkError, ReadTimeout
from httpx import NetworkError, TimeoutException, ConnectError
from json import JSONDecodeError
from urllib.parse import quote
@@ -75,12 +76,12 @@ def ready_blocks(video_id, duration, div, callback):
next_continuation, actions = None, []
break
param_set.add(continuation)
resp = await session.get(url, headers=headers)
resp = await session.get(url, headers=headers, timeout=10)
next_continuation, actions = parser.parse(resp.json())
break
except JSONDecodeError:
await asyncio.sleep(3)
except (NetworkError, ReadTimeout) as e:
except (NetworkError, TimeoutException, ConnectError) as e:
err = e
await asyncio.sleep(3)
else:
@@ -136,9 +137,12 @@ def fetch_patch(callback, blocks, video_id):
break
except JSONDecodeError:
await asyncio.sleep(3)
except (NetworkError, ReadTimeout) as e:
except (NetworkError, TimeoutException, ConnectError) as e:
err = e
await asyncio.sleep(3)
except socket.error as error:
print("socket error", error.errno)
await asyncio.sleep(3)
else:
cancel()
raise UnknownConnectionError("Abort:" + str(err))
@@ -162,15 +166,10 @@ def fetch_patch(callback, blocks, video_id):
async def _shutdown():
print("\nshutdown...")
tasks = [t for t in asyncio.all_tasks()
if t is not asyncio.current_task()]
for task in tasks:
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
def cancel():

View File

@@ -93,5 +93,4 @@ class Extractor:
return ret
def cancel(self):
print("cancel")
asyncdl.cancel()

View File

@@ -2,7 +2,7 @@ import httpx
import json
import re
import time
from httpx import ConnectError, NetworkError
from httpx import ConnectError, NetworkError, TimeoutException
from .. import config
from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError
from ..util.extract_video_id import extract_video_id
@@ -83,16 +83,21 @@ class VideoInfo:
def __init__(self, video_id):
self.video_id = extract_video_id(video_id)
err = None
for _ in range(3):
try:
text = self._get_page_text(self.video_id)
self._parse(text)
break
except PatternUnmatchError:
except (InvalidVideoIdException, UnknownConnectionError) as e:
print(str(e))
raise e
except Exception as e:
err = e
time.sleep(2)
pass
else:
raise PatternUnmatchError("Pattern Unmatch")
raise err
def _get_page_text(self, video_id):
url = f"https://www.youtube.com/embed/{video_id}"
@@ -102,7 +107,7 @@ class VideoInfo:
resp = httpx.get(url, headers=headers)
resp.raise_for_status()
break
except (ConnectError, NetworkError) as e:
except (ConnectError, NetworkError, TimeoutException) as e:
err = e
time.sleep(3)
else:
@@ -113,7 +118,7 @@ class VideoInfo:
def _parse(self, text):
result = re.search(pattern, text)
if result is None:
raise PatternUnmatchError()
raise PatternUnmatchError(doc=text)
decoder = json.JSONDecoder()
res = decoder.raw_decode(result.group(1)[:-1])[0]
response = self._get_item(res, item_response)

View File

@@ -8,6 +8,9 @@ YT_VIDEO_ID_LENGTH = 11
def extract_video_id(url_or_id: str) -> str:
ret = ''
if '[' in url_or_id:
url_or_id = url_or_id.replace('[', '').replace(']', '')
if type(url_or_id) != str:
raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.")
if len(url_or_id) == YT_VIDEO_ID_LENGTH: