Compare commits
25 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 3106b3e545 | |
| | 50816a661d | |
| | 6755bc8bb2 | |
| | 26be989b9b | |
| | 73ad0a1f44 | |
| | 66b185ebf7 | |
| | 71650c39f7 | |
| | 488445c73b | |
| | 075e811efe | |
| | 58d9bf7fdb | |
| | b3e6275de7 | |
| | 748778f545 | |
| | e29b3b8377 | |
| | 0859ed5fb1 | |
| | a80d5ba080 | |
| | b7e6043a71 | |
| | 820ba35013 | |
| | ecd2d130bf | |
| | f77a2c889b | |
| | 47d5ab288f | |
| | 5f53fd24dd | |
| | 11a9d0e2d7 | |
| | 480c9e15b8 | |
| | 35aa7636f6 | |
| | 8fee67c2d4 | |
pytchat/__init__.py

```diff
@@ -2,7 +2,7 @@
 pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
 """
 __copyright__ = 'Copyright (C) 2019 taizan-hokuto'
-__version__ = '0.2.1'
+__version__ = '0.3.0'
 __license__ = 'MIT'
 __author__ = 'taizan-hokuto'
 __author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'
```
pytchat/cli/__init__.py

```diff
@@ -2,11 +2,13 @@ import argparse
 
 import os
 import signal
 import time
+from json.decoder import JSONDecodeError
 from pathlib import Path
+from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError
 from .arguments import Arguments
 from .progressbar import ProgressBar
-from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError
+from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError, UnknownConnectionError
 from .. processors.html_archiver import HTMLArchiver
 from .. tool.extract.extractor import Extractor
 from .. tool.videoinfo import VideoInfo
@@ -49,21 +51,36 @@ def main():
     for counter, video_id in enumerate(Arguments().video_ids):
         if '[' in video_id:
             video_id = video_id.replace('[', '').replace(']', '')
-        if len(Arguments().video_ids) > 1:
-            print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
+
         try:
             video_id = extract_video_id(video_id)
-            if os.path.exists(Arguments().output):
-                path = Path(Arguments().output + video_id + '.html')
-            else:
+            if not os.path.exists(Arguments().output):
                 raise FileNotFoundError
-            info = VideoInfo(video_id)
+            if len(Arguments().video_ids) > 1:
+                print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
+            separated_path = str(Path(Arguments().output)) + os.path.sep
+            path = util.checkpath(separated_path + video_id + '.html')
+            err = None
+            for _ in range(3):  # retry 3 times
+                try:
+                    info = VideoInfo(video_id)
+                    break
+                except (PatternUnmatchError, JSONDecodeError, InvalidVideoIdException) as e:
+                    err = e
+                    time.sleep(2)
+                    continue
+            else:
+                print("Cannot parse video information.:{}".format(video_id))
+                if Arguments().save_error_data:
+                    util.save(err.doc, "ERR", ".dat")
+                continue
 
             print(f"\n"
                   f" video_id: {video_id}\n"
                   f" channel: {info.get_channel_name()}\n"
                   f" title: {info.get_title()}")
 
-            print(f" output path: {path.resolve()}")
+            print(f" output path: {path}")
             duration = info.get_duration()
             pbar = ProgressBar(total=(duration * 1000), status="Extracting")
             ex = Extractor(video_id,
```
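The new retry loop above leans on Python's `for`/`else`: the `else` suite runs only when the loop finishes without hitting `break`, i.e. when every attempt failed. A minimal standalone sketch of the idiom; `fetch` and the `ValueError` stand-in are placeholders, not pytchat APIs:

```python
import time


def fetch_with_retry(fetch, attempts=3, wait=2):
    err = None
    for _ in range(attempts):
        try:
            result = fetch()
            break                 # success: the else suite is skipped
        except ValueError as e:   # stand-in for PatternUnmatchError etc.
            err = e
            time.sleep(wait)
    else:
        # reached only if no iteration hit `break`
        raise RuntimeError(f"all {attempts} attempts failed: {err}")
    return result
```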
```diff
@@ -74,7 +91,7 @@ def main():
             if data == []:
                 return False
             pbar.reset("#", "=", total=len(data), status="Rendering ")
-            processor = HTMLArchiver(Arguments().output + video_id + '.html', callback=pbar._disp)
+            processor = HTMLArchiver(path, callback=pbar._disp)
             processor.process(
                 [{'video_id': None,
                   'timeout': 1,
@@ -86,8 +103,6 @@ def main():
             print()
             if pbar.is_cancelled():
                 print("\nThe extraction process has been discontinued.\n")
-
-
         except InvalidVideoIdException:
             print("Invalid Video ID or URL:", video_id)
         except NoContents as e:
@@ -96,14 +111,15 @@ def main():
             print("The specified directory does not exist.:{}".format(Arguments().output))
         except JSONDecodeError as e:
             print(e.msg)
-            print("Cannot parse video information.:{}".format(video_id))
+            print("JSONDecodeError.:{}".format(video_id))
             if Arguments().save_error_data:
                 util.save(e.doc, "ERR_JSON_DECODE", ".dat")
-        except PatternUnmatchError as e:
-            print(e.msg)
-            print("Cannot parse video information.:{}".format(video_id))
-            if Arguments().save_error_data:
-                util.save(e.doc, "ERR_PATTERN_UNMATCH", ".dat")
+        except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
+            print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
+        except PatternUnmatchError:
+            print(f"PatternUnmatchError [{video_id}]. ")
         except Exception as e:
             print(type(e), str(e))
 
     return
```
pytchat/exceptions.py

```diff
@@ -38,7 +38,9 @@ class InvalidVideoIdException(Exception):
     '''
     Thrown when the video_id is not exist (VideoInfo).
     '''
-    pass
+    def __init__(self, doc):
+        self.msg = "InvalidVideoIdException"
+        self.doc = doc
 
 
 class UnknownConnectionError(Exception):
@@ -47,7 +49,7 @@ class UnknownConnectionError(Exception):
 
 class RetryExceedMaxCount(Exception):
     '''
-    thrown when the number of retries exceeds the maximum value.
+    Thrown when the number of retries exceeds the maximum value.
     '''
     pass
@@ -66,13 +68,13 @@ class FailedExtractContinuation(ChatDataFinished):
 
 class VideoInfoParseError(Exception):
     '''
-    thrown when failed to parse video info
+    Base exception when parsing video info.
     '''
 
 
 class PatternUnmatchError(VideoInfoParseError):
     '''
-    thrown when failed to parse video info with unmatched pattern
+    Thrown when failed to parse video info with unmatched pattern.
     '''
     def __init__(self, doc):
         self.msg = "PatternUnmatchError"
```
pytchat/processors/html_archiver.py

```diff
@@ -1,9 +1,12 @@
-import httpx
 import os
 import re
+import httpx
+import time
 from base64 import standard_b64encode
+from httpx import NetworkError, ReadTimeout
 from .chat_processor import ChatProcessor
 from .default.processor import DefaultProcessor
+from ..exceptions import UnknownConnectionError
 
 
 PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
@@ -112,7 +115,18 @@ class HTMLArchiver(ChatProcessor):
                        for item in message_items)
 
     def _encode_img(self, url):
-        resp = httpx.get(url)
+        err = None
+        for _ in range(5):
+            try:
+                resp = httpx.get(url, timeout=30)
+                break
+            except (NetworkError, ReadTimeout) as e:
+                print("Network Error. retrying...")
+                err = e
+                time.sleep(3)
+        else:
+            raise UnknownConnectionError(str(err))
+
         return standard_b64encode(resp.content).decode()
 
     def _set_emoji_table(self, item: dict):
```
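`_encode_img` returns the fetched image as a base64 string, presumably so the archiver can inline it into the saved HTML (for example as a `data:` URI) and keep the page self-contained. A hedged illustration of that encoding step, with made-up bytes standing in for `resp.content`:

```python
from base64 import standard_b64encode

img_bytes = b"\x89PNG\r\n\x1a\n..."  # stand-in for resp.content
b64 = standard_b64encode(img_bytes).decode()
# hypothetical inline embedding of the fetched image:
tag = f'<img src="data:image/png;base64,{b64}">'
```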
pytchat/tool/extract/asyncdl.py

```diff
@@ -8,14 +8,19 @@ from ... import config
 from ... paramgen import arcparam
+from ... exceptions import UnknownConnectionError
 from concurrent.futures import CancelledError
+from httpx import NetworkError, ReadTimeout
+from json import JSONDecodeError
 from urllib.parse import quote
 
 
 headers = config.headers
 REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
              "get_live_chat_replay?continuation="
+MAX_RETRY_COUNT = 3
+
+# Set to avoid duplicate parameters
+param_set = set()
 
 
 def _split(start, end, count, min_interval_sec=120):
     """
@@ -50,6 +55,7 @@ def _split(start, end, count, min_interval_sec=120):
 
 
 def ready_blocks(video_id, duration, div, callback):
+    param_set.clear()
     if div <= 0:
         raise ValueError
 
@@ -62,16 +68,24 @@ def ready_blocks(video_id, duration, div, callback):
     async def _create_block(session, video_id, seektime, callback):
         continuation = arcparam.getparam(video_id, seektime=seektime)
         url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
+        err = None
         for _ in range(MAX_RETRY_COUNT):
             try:
+                if continuation in param_set:
+                    next_continuation, actions = None, []
+                    break
+                param_set.add(continuation)
                 resp = await session.get(url, headers=headers)
                 next_continuation, actions = parser.parse(resp.json())
                 break
             except JSONDecodeError:
                 await asyncio.sleep(3)
+            except (NetworkError, ReadTimeout) as e:
+                err = e
+                await asyncio.sleep(3)
         else:
             cancel()
-            raise UnknownConnectionError("Abort: Unknown connection error.")
+            raise UnknownConnectionError("Abort:" + str(err))
 
         if actions:
             first = parser.get_offset(actions[0])
@@ -110,16 +124,24 @@ def fetch_patch(callback, blocks, video_id):
 
     async def _fetch(continuation, session) -> Patch:
         url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
+        err = None
         for _ in range(MAX_RETRY_COUNT):
             try:
+                if continuation in param_set:
+                    continuation, actions = None, []
+                    break
+                param_set.add(continuation)
                 resp = await session.get(url, headers=config.headers)
                 continuation, actions = parser.parse(resp.json())
                 break
             except JSONDecodeError:
                 await asyncio.sleep(3)
+            except (NetworkError, ReadTimeout) as e:
+                err = e
+                await asyncio.sleep(3)
         else:
             cancel()
-            raise UnknownConnectionError("Abort: Unknown connection error.")
+            raise UnknownConnectionError("Abort:" + str(err))
 
         if actions:
             last = parser.get_offset(actions[-1])
```
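`param_set` is module-level state shared by `_create_block` and `_fetch`, which is why `ready_blocks` clears it at the start of each run: a continuation seen twice short-circuits to an empty result instead of being fetched again. A simplified synchronous sketch of that guard, where `fetch` is a placeholder for the session call:

```python
param_set = set()


def fetch_once(continuation, fetch):
    if continuation in param_set:
        return None, []             # already requested: skip, don't refetch
    param_set.add(continuation)
    return fetch(continuation)      # -> (next_continuation, actions)
```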
pytchat/tool/extract/extractor.py

```diff
@@ -93,4 +93,5 @@ class Extractor:
         return ret
 
     def cancel(self):
+        print("cancel")
         asyncdl.cancel()
```
pytchat/tool/extract/worker.py

```diff
@@ -7,7 +7,6 @@ from typing import Tuple
 class ExtractWorker:
     """
     ExtractWorker associates a download session with a block.
-
     When the worker finishes fetching, the block
     being fetched is splitted and assigned the free worker.
 
```
pytchat/tool/videoinfo.py

```diff
@@ -1,13 +1,16 @@
-import httpx
 import json
 import re
+import httpx
+import time
+from httpx import ConnectError, NetworkError
 from .. import config
-from ..exceptions import InvalidVideoIdException, PatternUnmatchError
+from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError
 from ..util.extract_video_id import extract_video_id
 
 
 headers = config.headers
 
-pattern = re.compile(r"'PLAYER_CONFIG': ({.*}}})")
+pattern = re.compile(r"['\"]PLAYER_CONFIG['\"]:\s*({.*})")
 
 item_channel_id = [
     "videoDetails",
@@ -80,19 +83,37 @@ class VideoInfo:
 
     def __init__(self, video_id):
         self.video_id = extract_video_id(video_id)
-        text = self._get_page_text(self.video_id)
-        self._parse(text)
+        for _ in range(3):
+            try:
+                text = self._get_page_text(self.video_id)
+                self._parse(text)
+                break
+            except PatternUnmatchError:
+                time.sleep(2)
+                pass
+        else:
+            raise PatternUnmatchError("Pattern Unmatch")
 
     def _get_page_text(self, video_id):
         url = f"https://www.youtube.com/embed/{video_id}"
-        resp = httpx.get(url, headers=headers)
-        resp.raise_for_status()
+        err = None
+        for _ in range(3):
+            try:
+                resp = httpx.get(url, headers=headers)
+                resp.raise_for_status()
+                break
+            except (ConnectError, NetworkError) as e:
+                err = e
+                time.sleep(3)
+        else:
+            raise UnknownConnectionError(str(err))
+
         return resp.text
 
     def _parse(self, text):
         result = re.search(pattern, text)
         if result is None:
-            raise PatternUnmatchError(text)
+            raise PatternUnmatchError()
         decoder = json.JSONDecoder()
         res = decoder.raw_decode(result.group(1)[:-1])[0]
         response = self._get_item(res, item_response)
```
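The loosened `PLAYER_CONFIG` regex captures greedily past the end of the JSON object, which is safe because `json.JSONDecoder.raw_decode` parses the first complete JSON value and reports where it stopped, ignoring any trailing page text. A standalone demonstration with an invented snippet:

```python
import json

decoder = json.JSONDecoder()
text = '{"args": {"a": 1}}; rest of the page'
obj, end = decoder.raw_decode(text)
print(obj)         # {'args': {'a': 1}}
print(text[end:])  # '; rest of the page'
```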
pytchat/util/__init__.py

```diff
@@ -1,8 +1,12 @@
-import datetime
 import httpx
 import json
+import datetime
+import os
+import re
 from .. import config
 
+PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
+
 
 def extract(url):
     _session = httpx.Client(http2=True)
@@ -16,3 +20,21 @@ def save(data, filename, extention):
     with open(filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention,
               mode='w', encoding='utf-8') as f:
         f.writelines(data)
+
+
+def checkpath(filepath):
+    splitter = os.path.splitext(os.path.basename(filepath))
+    body = splitter[0]
+    extention = splitter[1]
+    newpath = filepath
+    counter = 1
+    while os.path.exists(newpath):
+        match = re.search(PATTERN, body)
+        if match:
+            counter = int(match[2]) + 1
+            num_with_bracket = f'({str(counter)})'
+            body = f'{match[1]}{num_with_bracket}'
+        else:
+            body = f'{body}({str(counter)})'
+        newpath = os.path.join(os.path.dirname(filepath), body + extention)
+    return newpath
```
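`checkpath` gives the archiver collision-free output names by appending or incrementing a `(n)` suffix; `PATTERN` recognizes an existing suffix, so repeated collisions count up instead of nesting brackets. A small demo of the expected behavior, with paths invented for the example:

```python
import os
import tempfile

from pytchat import util

with tempfile.TemporaryDirectory() as d:
    for name in ("video.html", "video(1).html"):
        open(os.path.join(d, name), "w").close()
    # both names are taken, so the next free one is video(2).html
    print(util.checkpath(os.path.join(d, "video.html")))
```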