Compare commits

...

12 Commits

Author SHA1 Message Date
taizan-hokouto
39d99ad4af Merge branch 'hotfix/fix_json' 2020-10-06 01:30:15 +09:00
taizan-hokouto
3675c91240 Increment version 2020-10-06 01:24:31 +09:00
taizan-hokouto
46258f625a Fix import module 2020-10-06 01:24:04 +09:00
taizan-hokouto
2cc161b589 Increment version 2020-10-06 01:20:25 +09:00
taizan-hokouto
115277e5e1 Fix handling internal error and keyboard interrupt 2020-10-06 01:19:45 +09:00
taizan-hokouto
ebf0e7c181 Fix handling json decode error and pattern unmatch 2020-10-05 21:38:51 +09:00
taizan-hokouto
3106b3e545 Merge branch 'hotfix/filepath' 2020-10-04 11:33:58 +09:00
taizan-hokouto
50816a661d Increment version 2020-10-04 11:30:07 +09:00
taizan-hokouto
6755bc8bb2 Make sure to pass fixed filepath to processor 2020-10-04 11:29:52 +09:00
taizan-hokouto
26be989b9b Merge branch 'hotfix/fix' 2020-10-04 10:32:53 +09:00
taizan-hokouto
73ad0a1f44 Increment version 2020-10-04 10:22:34 +09:00
taizan-hokouto
66b185ebf7 Fix constructing filepath 2020-10-04 10:20:14 +09:00
7 changed files with 130 additions and 100 deletions

View File

@@ -2,7 +2,7 @@
pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
"""
__copyright__ = 'Copyright (C) 2019 taizan-hokuto'
__version__ = '0.2.7'
__version__ = '0.3.2'
__license__ = 'MIT'
__author__ = 'taizan-hokuto'
__author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'

View File

@@ -1,8 +1,11 @@
import argparse
import asyncio
try:
from asyncio import CancelledError
except ImportError:
from asyncio.futures import CancelledError
import os
import signal
import time
from json.decoder import JSONDecodeError
from pathlib import Path
from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError
@@ -38,6 +41,7 @@ def main():
help='Save error data when error occurs(".dat" file)')
parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true',
help='Show version')
Arguments(parser.parse_args().__dict__)
if Arguments().print_version:
@@ -48,84 +52,106 @@ def main():
if not Arguments().video_ids:
parser.print_help()
return
for counter, video_id in enumerate(Arguments().video_ids):
if '[' in video_id:
video_id = video_id.replace('[', '').replace(']', '')
if len(Arguments().video_ids) > 1:
print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
try:
video_id = extract_video_id(video_id)
if os.path.exists(Arguments().output):
if Arguments().output[-1] != "/" or Arguments().output[-1] != "\\":
Arguments().output = '/'.join([Arguments().output, os.path.sep])
path = util.checkpath(Path.resolve(Path(Arguments().output + video_id + '.html')))
else:
raise FileNotFoundError
err = None
for _ in range(3): # retry 3 times
if not os.path.exists(Arguments().output):
print("\nThe specified directory does not exist.:{}\n".format(Arguments().output))
return
try:
Runner().run()
except CancelledError as e:
print(str(e))
class Runner:
def run(self) -> None:
ex = None
pbar = None
for counter, video_id in enumerate(Arguments().video_ids):
if len(Arguments().video_ids) > 1:
print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
try:
video_id = extract_video_id(video_id)
separated_path = str(Path(Arguments().output)) + os.path.sep
path = util.checkpath(separated_path + video_id + '.html')
try:
info = VideoInfo(video_id)
break
except (PatternUnmatchError, JSONDecodeError, InvalidVideoIdException) as e:
err = e
time.sleep(2)
except Exception as e:
print("Cannot parse video information.:{} {}".format(video_id, type(e)))
if Arguments().save_error_data:
util.save(str(e), "ERR", ".dat")
continue
else:
print("Cannot parse video information.:{}".format(video_id))
print(f"\n"
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
f" title: {info.get_title()}\n"
f" output path: {path}")
duration = info.get_duration()
pbar = ProgressBar(total=(duration * 1000), status_txt="Extracting")
ex = Extractor(video_id,
callback=pbar.disp,
div=10)
signal.signal(signal.SIGINT, (lambda a, b: self.cancel(ex, pbar)))
data = ex.extract()
if data == []:
continue
pbar.reset("#", "=", total=len(data), status_txt="Rendering ")
processor = HTMLArchiver(path, callback=pbar.disp)
processor.process(
[{'video_id': None,
'timeout': 1,
'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
)
processor.finalize()
pbar.reset('#', '#', status_txt='Completed ')
pbar.close()
print()
if pbar.is_cancelled():
print("\nThe extraction process has been discontinued.\n")
except InvalidVideoIdException:
print("Invalid Video ID or URL:", video_id)
except NoContents as e:
print(f"Abort:{str(e)}:[{video_id}]")
except (JSONDecodeError, PatternUnmatchError) as e:
print("{}:{}".format(e.msg, video_id))
if Arguments().save_error_data:
util.save(err.doc, "ERR", ".dat")
continue
util.save(e.doc, "ERR_", ".dat")
except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
except Exception as e:
print(f"Abort:{str(type(e))} {str(e)[:80]}")
finally:
clear_tasks()
print(f"\n"
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
f" title: {info.get_title()}")
return
print(f" output path: {path}")
duration = info.get_duration()
pbar = ProgressBar(total=(duration * 1000), status="Extracting")
ex = Extractor(video_id,
callback=pbar._disp,
div=10)
signal.signal(signal.SIGINT, (lambda a, b: cancel(ex, pbar)))
data = ex.extract()
if data == []:
return False
pbar.reset("#", "=", total=len(data), status="Rendering ")
processor = HTMLArchiver(Arguments().output + video_id + '.html', callback=pbar._disp)
processor.process(
[{'video_id': None,
'timeout': 1,
'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
)
processor.finalize()
pbar.reset('#', '#', status='Completed ')
pbar.close()
print()
if pbar.is_cancelled():
print("\nThe extraction process has been discontinued.\n")
except InvalidVideoIdException:
print("Invalid Video ID or URL:", video_id)
except NoContents as e:
print(e)
except FileNotFoundError:
print("The specified directory does not exist.:{}".format(Arguments().output))
except JSONDecodeError as e:
print(e.msg)
print("JSONDecodeError.:{}".format(video_id))
if Arguments().save_error_data:
util.save(e.doc, "ERR_JSON_DECODE", ".dat")
except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
except PatternUnmatchError:
print(f"PatternUnmatchError [{video_id}]. ")
except Exception as e:
print(type(e), str(e))
return
def cancel(self, ex=None, pbar=None) -> None:
'''Handle a keyboard interrupt (SIGINT) during extraction.

Prints a notice and, when both objects are supplied, asks each of
them to cancel. run() registers this method as the SIGINT handler
through a lambda that passes its current extractor and progress bar.

Args:
ex: object exposing ``cancel()``; run() passes its Extractor.
pbar: object exposing ``cancel()``; run() passes its ProgressBar.
'''
print("\nKeyboard interrupted.\n")
# Both arguments default to None; nothing is cancelled unless both are set.
if ex and pbar:
ex.cancel()
pbar.cancel()
def cancel(ex, pbar):
ex.cancel()
pbar.cancel()
def clear_tasks():
'''
Cancel the asyncio tasks that are still unfinished.

Called when an internal exception has occurred or
after each extraction process is completed.
Any exception raised while doing so is printed, not re-raised.
'''
async def _shutdown():
# Collect every unfinished task except the one running this coroutine.
tasks = [t for t in asyncio.all_tasks()
if t is not asyncio.current_task()]
for task in tasks:
# cancel() only requests cancellation; the tasks are not awaited here.
task.cancel()
try:
loop = asyncio.get_event_loop()
# Run _shutdown() on the loop: all_tasks()/current_task() need a running loop.
loop.run_until_complete(_shutdown())
except Exception as e:
# Best-effort cleanup: report the failure and carry on.
print(e)

View File

@@ -9,21 +9,20 @@ import sys
class ProgressBar:
def __init__(self, total, status):
def __init__(self, total, status_txt):
self._bar_len = 60
self._cancelled = False
self.reset(total=total, status=status)
self._blinker = 0
self.reset(total=total, status_txt=status_txt)
def reset(self, symbol_done="=", symbol_space=" ", total=100, status=''):
self.con_width = shutil.get_terminal_size(fallback=(80, 24)).columns
def reset(self, symbol_done="=", symbol_space=" ", total=100, status_txt=''):
self._console_width = shutil.get_terminal_size(fallback=(80, 24)).columns
self._symbol_done = symbol_done
self._symbol_space = symbol_space
self._total = total
self._status = status
self._status_txt = status_txt
self._count = 0
def _disp(self, _, fetched):
def disp(self, _, fetched):
self._progress(fetched, self._total)
def _progress(self, fillin, total):
@@ -39,11 +38,10 @@ class ProgressBar:
bar = self._symbol_done * filled_len + \
self._symbol_space * (self._bar_len - filled_len)
disp = f" [{bar}] {percents:>5.1f}% ...{self._status} "[:self.con_width - 1] + '\r'
disp = f" [{bar}] {percents:>5.1f}% ...{self._status_txt} "[:self._console_width - 1] + '\r'
sys.stdout.write(disp)
sys.stdout.flush()
self._blinker += 1
def close(self):
if not self._cancelled:

View File

@@ -1,5 +1,6 @@
import httpx
import asyncio
import httpx
import socket
from . import parser
from . block import Block
from . worker import ExtractWorker
@@ -8,7 +9,7 @@ from ... import config
from ... paramgen import arcparam
from ... exceptions import UnknownConnectionError
from concurrent.futures import CancelledError
from httpx import NetworkError, ReadTimeout
from httpx import NetworkError, TimeoutException, ConnectError
from json import JSONDecodeError
from urllib.parse import quote
@@ -75,12 +76,12 @@ def ready_blocks(video_id, duration, div, callback):
next_continuation, actions = None, []
break
param_set.add(continuation)
resp = await session.get(url, headers=headers)
resp = await session.get(url, headers=headers, timeout=10)
next_continuation, actions = parser.parse(resp.json())
break
except JSONDecodeError:
await asyncio.sleep(3)
except (NetworkError, ReadTimeout) as e:
except (NetworkError, TimeoutException, ConnectError) as e:
err = e
await asyncio.sleep(3)
else:
@@ -136,9 +137,12 @@ def fetch_patch(callback, blocks, video_id):
break
except JSONDecodeError:
await asyncio.sleep(3)
except (NetworkError, ReadTimeout) as e:
except (NetworkError, TimeoutException, ConnectError) as e:
err = e
await asyncio.sleep(3)
except socket.error as error:
print("socket error", error.errno)
await asyncio.sleep(3)
else:
cancel()
raise UnknownConnectionError("Abort:" + str(err))
@@ -162,15 +166,10 @@ def fetch_patch(callback, blocks, video_id):
async def _shutdown():
print("\nshutdown...")
tasks = [t for t in asyncio.all_tasks()
if t is not asyncio.current_task()]
for task in tasks:
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
def cancel():

View File

@@ -93,5 +93,4 @@ class Extractor:
return ret
def cancel(self):
print("cancel")
asyncdl.cancel()

View File

@@ -2,7 +2,7 @@ import httpx
import json
import re
import time
from httpx import ConnectError, NetworkError
from httpx import ConnectError, NetworkError, TimeoutException
from .. import config
from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError
from ..util.extract_video_id import extract_video_id
@@ -83,16 +83,21 @@ class VideoInfo:
def __init__(self, video_id):
self.video_id = extract_video_id(video_id)
err = None
for _ in range(3):
try:
text = self._get_page_text(self.video_id)
self._parse(text)
break
except PatternUnmatchError:
except (InvalidVideoIdException, UnknownConnectionError) as e:
print(str(e))
raise e
except Exception as e:
err = e
time.sleep(2)
pass
else:
raise PatternUnmatchError("Pattern Unmatch")
raise err
def _get_page_text(self, video_id):
url = f"https://www.youtube.com/embed/{video_id}"
@@ -102,7 +107,7 @@ class VideoInfo:
resp = httpx.get(url, headers=headers)
resp.raise_for_status()
break
except (ConnectError, NetworkError) as e:
except (ConnectError, NetworkError, TimeoutException) as e:
err = e
time.sleep(3)
else:
@@ -113,7 +118,7 @@ class VideoInfo:
def _parse(self, text):
result = re.search(pattern, text)
if result is None:
raise PatternUnmatchError()
raise PatternUnmatchError(doc=text)
decoder = json.JSONDecoder()
res = decoder.raw_decode(result.group(1)[:-1])[0]
response = self._get_item(res, item_response)

View File

@@ -8,6 +8,9 @@ YT_VIDEO_ID_LENGTH = 11
def extract_video_id(url_or_id: str) -> str:
ret = ''
if '[' in url_or_id:
url_or_id = url_or_id.replace('[', '').replace(']', '')
if type(url_or_id) != str:
raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.")
if len(url_or_id) == YT_VIDEO_ID_LENGTH: