Compare commits

...

28 Commits

Author SHA1 Message Date
taizan-hokouto
39d99ad4af Merge branch 'hotfix/fix_json' 2020-10-06 01:30:15 +09:00
taizan-hokouto
3675c91240 Increment version 2020-10-06 01:24:31 +09:00
taizan-hokouto
46258f625a Fix import module 2020-10-06 01:24:04 +09:00
taizan-hokouto
2cc161b589 Increment version 2020-10-06 01:20:25 +09:00
taizan-hokouto
115277e5e1 Fix handling internal error and keyboard interrupt 2020-10-06 01:19:45 +09:00
taizan-hokouto
ebf0e7c181 Fix handling json decode error and pattern unmatch 2020-10-05 21:38:51 +09:00
taizan-hokouto
3106b3e545 Merge branch 'hotfix/filepath' 2020-10-04 11:33:58 +09:00
taizan-hokouto
50816a661d Increment version 2020-10-04 11:30:07 +09:00
taizan-hokouto
6755bc8bb2 Make sure to pass fixed filepath to processor 2020-10-04 11:29:52 +09:00
taizan-hokouto
26be989b9b Merge branch 'hotfix/fix' 2020-10-04 10:32:53 +09:00
taizan-hokouto
73ad0a1f44 Increment version 2020-10-04 10:22:34 +09:00
taizan-hokouto
66b185ebf7 Fix constructing filepath 2020-10-04 10:20:14 +09:00
taizan_hokuto
71650c39f7 Merge branch 'hotfix/fix' 2020-10-03 22:42:48 +09:00
taizan_hokuto
488445c73b Increment version 2020-10-03 22:41:53 +09:00
taizan_hokuto
075e811efe Delete unnecessary code 2020-10-03 22:41:12 +09:00
taizan_hokuto
58d9bf7fdb Merge branch 'hotfix/pattern' 2020-10-03 22:35:46 +09:00
taizan_hokuto
b3e6275de7 Increment version 2020-10-03 22:35:22 +09:00
taizan_hokuto
748778f545 Fix pattern matching 2020-10-03 22:04:09 +09:00
taizan-hokuto
e29b3b8377 Merge branch 'hotfix/network' 2020-09-14 00:40:40 +09:00
taizan-hokuto
0859ed5fb1 Increment version 2020-09-14 00:29:21 +09:00
taizan-hokuto
a80d5ba080 Fix handling network error 2020-09-14 00:28:41 +09:00
taizan-hokuto
b7e6043a71 Merge branch 'hotfix/memory' 2020-09-12 02:12:46 +09:00
taizan-hokuto
820ba35013 Increment version 2020-09-12 02:02:07 +09:00
taizan-hokuto
ecd2d130bf Clear set each time the extraction changes 2020-09-12 01:57:55 +09:00
taizan-hokuto
f77a2c889b Merge branch 'hotfix/not_quit' 2020-09-12 00:57:48 +09:00
taizan-hokuto
47d5ab288f Increment version 2020-09-12 00:49:37 +09:00
taizan-hokuto
5f53fd24dd Format 2020-09-12 00:48:40 +09:00
taizan-hokuto
11a9d0e2d7 Fix a problem with extraction not completing 2020-09-12 00:42:30 +09:00
10 changed files with 219 additions and 101 deletions

View File

@@ -2,7 +2,7 @@
pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup. pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
""" """
__copyright__ = 'Copyright (C) 2019 taizan-hokuto' __copyright__ = 'Copyright (C) 2019 taizan-hokuto'
__version__ = '0.2.2' __version__ = '0.3.2'
__license__ = 'MIT' __license__ = 'MIT'
__author__ = 'taizan-hokuto' __author__ = 'taizan-hokuto'
__author_email__ = '55448286+taizan-hokuto@users.noreply.github.com' __author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'

View File

@@ -1,13 +1,17 @@
import argparse import argparse
import asyncio
try:
from asyncio import CancelledError
except ImportError:
from asyncio.futures import CancelledError
import os import os
import signal import signal
import time
from json.decoder import JSONDecodeError from json.decoder import JSONDecodeError
from pathlib import Path from pathlib import Path
from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError
from .arguments import Arguments from .arguments import Arguments
from .progressbar import ProgressBar from .progressbar import ProgressBar
from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError, UnknownConnectionError
from .. processors.html_archiver import HTMLArchiver from .. processors.html_archiver import HTMLArchiver
from .. tool.extract.extractor import Extractor from .. tool.extract.extractor import Extractor
from .. tool.videoinfo import VideoInfo from .. tool.videoinfo import VideoInfo
@@ -37,6 +41,7 @@ def main():
help='Save error data when error occurs(".dat" file)') help='Save error data when error occurs(".dat" file)')
parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true', parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true',
help='Show version') help='Show version')
Arguments(parser.parse_args().__dict__) Arguments(parser.parse_args().__dict__)
if Arguments().print_version: if Arguments().print_version:
@@ -47,75 +52,106 @@ def main():
if not Arguments().video_ids: if not Arguments().video_ids:
parser.print_help() parser.print_help()
return return
for counter, video_id in enumerate(Arguments().video_ids):
if '[' in video_id:
video_id = video_id.replace('[', '').replace(']', '')
try:
video_id = extract_video_id(video_id)
if os.path.exists(Arguments().output):
path = Path(Arguments().output + video_id + '.html')
else:
raise FileNotFoundError
err = None
for _ in range(3): # retry 3 times
try:
info = VideoInfo(video_id)
break
except (PatternUnmatchError, JSONDecodeError, InvalidVideoIdException) as e:
err = e
time.sleep(2)
continue
else:
print("Cannot parse video information.:{}".format(video_id))
if Arguments().save_error_data:
util.save(err.doc, "ERR", ".dat")
continue
if not os.path.exists(Arguments().output):
print("\nThe specified directory does not exist.:{}\n".format(Arguments().output))
return
try:
Runner().run()
except CancelledError as e:
print(str(e))
class Runner:
def run(self) -> None:
ex = None
pbar = None
for counter, video_id in enumerate(Arguments().video_ids):
if len(Arguments().video_ids) > 1: if len(Arguments().video_ids) > 1:
print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}") print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
print(f"\n"
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
f" title: {info.get_title()}")
print(f" output path: {path.resolve()}") try:
duration = info.get_duration() video_id = extract_video_id(video_id)
pbar = ProgressBar(total=(duration * 1000), status="Extracting") separated_path = str(Path(Arguments().output)) + os.path.sep
ex = Extractor(video_id, path = util.checkpath(separated_path + video_id + '.html')
callback=pbar._disp, try:
div=10) info = VideoInfo(video_id)
signal.signal(signal.SIGINT, (lambda a, b: cancel(ex, pbar))) except Exception as e:
data = ex.extract() print("Cannot parse video information.:{} {}".format(video_id, type(e)))
if data == []: if Arguments().save_error_data:
return False util.save(str(e), "ERR", ".dat")
pbar.reset("#", "=", total=len(data), status="Rendering ") continue
processor = HTMLArchiver(Arguments().output + video_id + '.html', callback=pbar._disp)
processor.process(
[{'video_id': None,
'timeout': 1,
'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
)
processor.finalize()
pbar.reset('#', '#', status='Completed ')
pbar.close()
print()
if pbar.is_cancelled():
print("\nThe extraction process has been discontinued.\n")
except InvalidVideoIdException:
print("Invalid Video ID or URL:", video_id)
except NoContents as e:
print(e)
except FileNotFoundError:
print("The specified directory does not exist.:{}".format(Arguments().output))
except JSONDecodeError as e:
print(e.msg)
print("JSONDecodeError.:{}".format(video_id))
if Arguments().save_error_data:
util.save(e.doc, "ERR_JSON_DECODE", ".dat")
return print(f"\n"
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
f" title: {info.get_title()}\n"
f" output path: {path}")
duration = info.get_duration()
pbar = ProgressBar(total=(duration * 1000), status_txt="Extracting")
ex = Extractor(video_id,
callback=pbar.disp,
div=10)
signal.signal(signal.SIGINT, (lambda a, b: self.cancel(ex, pbar)))
data = ex.extract()
if data == []:
continue
pbar.reset("#", "=", total=len(data), status_txt="Rendering ")
processor = HTMLArchiver(path, callback=pbar.disp)
processor.process(
[{'video_id': None,
'timeout': 1,
'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
)
processor.finalize()
pbar.reset('#', '#', status_txt='Completed ')
pbar.close()
print()
if pbar.is_cancelled():
print("\nThe extraction process has been discontinued.\n")
except InvalidVideoIdException:
print("Invalid Video ID or URL:", video_id)
except NoContents as e:
print(f"Abort:{str(e)}:[{video_id}]")
except (JSONDecodeError, PatternUnmatchError) as e:
print("{}:{}".format(e.msg, video_id))
if Arguments().save_error_data:
util.save(e.doc, "ERR_", ".dat")
except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
except Exception as e:
print(f"Abort:{str(type(e))} {str(e)[:80]}")
finally:
clear_tasks()
return
def cancel(self, ex=None, pbar=None) -> None:
'''Called when keyboard interrupted has occurred.
'''
print("\nKeyboard interrupted.\n")
if ex and pbar:
ex.cancel()
pbar.cancel()
def cancel(ex, pbar): def clear_tasks():
ex.cancel() '''
pbar.cancel() Clear remained tasks.
Called when internal exception has occurred or
after each extraction process is completed.
'''
async def _shutdown():
tasks = [t for t in asyncio.all_tasks()
if t is not asyncio.current_task()]
for task in tasks:
task.cancel()
try:
loop = asyncio.get_event_loop()
loop.run_until_complete(_shutdown())
except Exception as e:
print(e)

View File

@@ -9,21 +9,20 @@ import sys
class ProgressBar: class ProgressBar:
def __init__(self, total, status): def __init__(self, total, status_txt):
self._bar_len = 60 self._bar_len = 60
self._cancelled = False self._cancelled = False
self.reset(total=total, status=status) self.reset(total=total, status_txt=status_txt)
self._blinker = 0
def reset(self, symbol_done="=", symbol_space=" ", total=100, status=''): def reset(self, symbol_done="=", symbol_space=" ", total=100, status_txt=''):
self.con_width = shutil.get_terminal_size(fallback=(80, 24)).columns self._console_width = shutil.get_terminal_size(fallback=(80, 24)).columns
self._symbol_done = symbol_done self._symbol_done = symbol_done
self._symbol_space = symbol_space self._symbol_space = symbol_space
self._total = total self._total = total
self._status = status self._status_txt = status_txt
self._count = 0 self._count = 0
def _disp(self, _, fetched): def disp(self, _, fetched):
self._progress(fetched, self._total) self._progress(fetched, self._total)
def _progress(self, fillin, total): def _progress(self, fillin, total):
@@ -39,11 +38,10 @@ class ProgressBar:
bar = self._symbol_done * filled_len + \ bar = self._symbol_done * filled_len + \
self._symbol_space * (self._bar_len - filled_len) self._symbol_space * (self._bar_len - filled_len)
disp = f" [{bar}] {percents:>5.1f}% ...{self._status} "[:self.con_width - 1] + '\r' disp = f" [{bar}] {percents:>5.1f}% ...{self._status_txt} "[:self._console_width - 1] + '\r'
sys.stdout.write(disp) sys.stdout.write(disp)
sys.stdout.flush() sys.stdout.flush()
self._blinker += 1
def close(self): def close(self):
if not self._cancelled: if not self._cancelled:

View File

@@ -43,7 +43,6 @@ class InvalidVideoIdException(Exception):
self.doc = doc self.doc = doc
class UnknownConnectionError(Exception): class UnknownConnectionError(Exception):
pass pass

View File

@@ -1,9 +1,12 @@
import httpx
import os import os
import re import re
import httpx import time
from base64 import standard_b64encode from base64 import standard_b64encode
from httpx import NetworkError, ReadTimeout
from .chat_processor import ChatProcessor from .chat_processor import ChatProcessor
from .default.processor import DefaultProcessor from .default.processor import DefaultProcessor
from ..exceptions import UnknownConnectionError
PATTERN = re.compile(r"(.*)\(([0-9]+)\)$") PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
@@ -112,7 +115,18 @@ class HTMLArchiver(ChatProcessor):
for item in message_items) for item in message_items)
def _encode_img(self, url): def _encode_img(self, url):
resp = httpx.get(url) err = None
for _ in range(5):
try:
resp = httpx.get(url, timeout=30)
break
except (NetworkError, ReadTimeout) as e:
print("Network Error. retrying...")
err = e
time.sleep(3)
else:
raise UnknownConnectionError(str(err))
return standard_b64encode(resp.content).decode() return standard_b64encode(resp.content).decode()
def _set_emoji_table(self, item: dict): def _set_emoji_table(self, item: dict):

View File

@@ -1,5 +1,6 @@
import httpx
import asyncio import asyncio
import httpx
import socket
from . import parser from . import parser
from . block import Block from . block import Block
from . worker import ExtractWorker from . worker import ExtractWorker
@@ -8,14 +9,19 @@ from ... import config
from ... paramgen import arcparam from ... paramgen import arcparam
from ... exceptions import UnknownConnectionError from ... exceptions import UnknownConnectionError
from concurrent.futures import CancelledError from concurrent.futures import CancelledError
from httpx import NetworkError, TimeoutException, ConnectError
from json import JSONDecodeError from json import JSONDecodeError
from urllib.parse import quote from urllib.parse import quote
headers = config.headers headers = config.headers
REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \ REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
"get_live_chat_replay?continuation=" "get_live_chat_replay?continuation="
MAX_RETRY_COUNT = 3 MAX_RETRY_COUNT = 3
# Set to avoid duplicate parameters
param_set = set()
def _split(start, end, count, min_interval_sec=120): def _split(start, end, count, min_interval_sec=120):
""" """
@@ -50,6 +56,7 @@ def _split(start, end, count, min_interval_sec=120):
def ready_blocks(video_id, duration, div, callback): def ready_blocks(video_id, duration, div, callback):
param_set.clear()
if div <= 0: if div <= 0:
raise ValueError raise ValueError
@@ -62,16 +69,24 @@ def ready_blocks(video_id, duration, div, callback):
async def _create_block(session, video_id, seektime, callback): async def _create_block(session, video_id, seektime, callback):
continuation = arcparam.getparam(video_id, seektime=seektime) continuation = arcparam.getparam(video_id, seektime=seektime)
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1" url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
err = None
for _ in range(MAX_RETRY_COUNT): for _ in range(MAX_RETRY_COUNT):
try: try:
resp = await session.get(url, headers=headers) if continuation in param_set:
next_continuation, actions = None, []
break
param_set.add(continuation)
resp = await session.get(url, headers=headers, timeout=10)
next_continuation, actions = parser.parse(resp.json()) next_continuation, actions = parser.parse(resp.json())
break break
except JSONDecodeError: except JSONDecodeError:
await asyncio.sleep(3) await asyncio.sleep(3)
except (NetworkError, TimeoutException, ConnectError) as e:
err = e
await asyncio.sleep(3)
else: else:
cancel() cancel()
raise UnknownConnectionError("Abort: Unknown connection error.") raise UnknownConnectionError("Abort:" + str(err))
if actions: if actions:
first = parser.get_offset(actions[0]) first = parser.get_offset(actions[0])
@@ -110,16 +125,27 @@ def fetch_patch(callback, blocks, video_id):
async def _fetch(continuation, session) -> Patch: async def _fetch(continuation, session) -> Patch:
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1" url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
err = None
for _ in range(MAX_RETRY_COUNT): for _ in range(MAX_RETRY_COUNT):
try: try:
if continuation in param_set:
continuation, actions = None, []
break
param_set.add(continuation)
resp = await session.get(url, headers=config.headers) resp = await session.get(url, headers=config.headers)
continuation, actions = parser.parse(resp.json()) continuation, actions = parser.parse(resp.json())
break break
except JSONDecodeError: except JSONDecodeError:
await asyncio.sleep(3) await asyncio.sleep(3)
except (NetworkError, TimeoutException, ConnectError) as e:
err = e
await asyncio.sleep(3)
except socket.error as error:
print("socket error", error.errno)
await asyncio.sleep(3)
else: else:
cancel() cancel()
raise UnknownConnectionError("Abort: Unknown connection error.") raise UnknownConnectionError("Abort:" + str(err))
if actions: if actions:
last = parser.get_offset(actions[-1]) last = parser.get_offset(actions[-1])
@@ -140,15 +166,10 @@ def fetch_patch(callback, blocks, video_id):
async def _shutdown(): async def _shutdown():
print("\nshutdown...")
tasks = [t for t in asyncio.all_tasks() tasks = [t for t in asyncio.all_tasks()
if t is not asyncio.current_task()] if t is not asyncio.current_task()]
for task in tasks: for task in tasks:
task.cancel() task.cancel()
try:
await task
except asyncio.CancelledError:
pass
def cancel(): def cancel():

View File

@@ -7,7 +7,6 @@ from typing import Tuple
class ExtractWorker: class ExtractWorker:
""" """
ExtractWorker associates a download session with a block. ExtractWorker associates a download session with a block.
When the worker finishes fetching, the block When the worker finishes fetching, the block
being fetched is splitted and assigned the free worker. being fetched is splitted and assigned the free worker.

View File

@@ -1,13 +1,16 @@
import httpx
import json import json
import re import re
import httpx import time
from httpx import ConnectError, NetworkError, TimeoutException
from .. import config from .. import config
from ..exceptions import InvalidVideoIdException, PatternUnmatchError from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError
from ..util.extract_video_id import extract_video_id from ..util.extract_video_id import extract_video_id
headers = config.headers
pattern = re.compile(r"'PLAYER_CONFIG': ({.*}}})") headers = config.headers
pattern = re.compile(r"['\"]PLAYER_CONFIG['\"]:\s*({.*})")
item_channel_id = [ item_channel_id = [
"videoDetails", "videoDetails",
@@ -80,19 +83,42 @@ class VideoInfo:
def __init__(self, video_id): def __init__(self, video_id):
self.video_id = extract_video_id(video_id) self.video_id = extract_video_id(video_id)
text = self._get_page_text(self.video_id) err = None
self._parse(text) for _ in range(3):
try:
text = self._get_page_text(self.video_id)
self._parse(text)
break
except (InvalidVideoIdException, UnknownConnectionError) as e:
print(str(e))
raise e
except Exception as e:
err = e
time.sleep(2)
pass
else:
raise err
def _get_page_text(self, video_id): def _get_page_text(self, video_id):
url = f"https://www.youtube.com/embed/{video_id}" url = f"https://www.youtube.com/embed/{video_id}"
resp = httpx.get(url, headers=headers) err = None
resp.raise_for_status() for _ in range(3):
try:
resp = httpx.get(url, headers=headers)
resp.raise_for_status()
break
except (ConnectError, NetworkError, TimeoutException) as e:
err = e
time.sleep(3)
else:
raise UnknownConnectionError(str(err))
return resp.text return resp.text
def _parse(self, text): def _parse(self, text):
result = re.search(pattern, text) result = re.search(pattern, text)
if result is None: if result is None:
raise PatternUnmatchError(text) raise PatternUnmatchError(doc=text)
decoder = json.JSONDecoder() decoder = json.JSONDecoder()
res = decoder.raw_decode(result.group(1)[:-1])[0] res = decoder.raw_decode(result.group(1)[:-1])[0]
response = self._get_item(res, item_response) response = self._get_item(res, item_response)

View File

@@ -1,8 +1,12 @@
import datetime
import httpx import httpx
import json import json
import datetime import os
import re
from .. import config from .. import config
PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
def extract(url): def extract(url):
_session = httpx.Client(http2=True) _session = httpx.Client(http2=True)
@@ -16,3 +20,21 @@ def save(data, filename, extention):
with open(filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention, with open(filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention,
mode='w', encoding='utf-8') as f: mode='w', encoding='utf-8') as f:
f.writelines(data) f.writelines(data)
def checkpath(filepath):
splitter = os.path.splitext(os.path.basename(filepath))
body = splitter[0]
extention = splitter[1]
newpath = filepath
counter = 1
while os.path.exists(newpath):
match = re.search(PATTERN, body)
if match:
counter = int(match[2]) + 1
num_with_bracket = f'({str(counter)})'
body = f'{match[1]}{num_with_bracket}'
else:
body = f'{body}({str(counter)})'
newpath = os.path.join(os.path.dirname(filepath), body + extention)
return newpath

View File

@@ -8,6 +8,9 @@ YT_VIDEO_ID_LENGTH = 11
def extract_video_id(url_or_id: str) -> str: def extract_video_id(url_or_id: str) -> str:
ret = '' ret = ''
if '[' in url_or_id:
url_or_id = url_or_id.replace('[', '').replace(']', '')
if type(url_or_id) != str: if type(url_or_id) != str:
raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.") raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.")
if len(url_or_id) == YT_VIDEO_ID_LENGTH: if len(url_or_id) == YT_VIDEO_ID_LENGTH: