Compare commits

...

10 Commits

Author SHA1 Message Date
taizan-hokuto
e29b3b8377 Merge branch 'hotfix/network' 2020-09-14 00:40:40 +09:00
taizan-hokuto
0859ed5fb1 Increment version 2020-09-14 00:29:21 +09:00
taizan-hokuto
a80d5ba080 Fix handling network error 2020-09-14 00:28:41 +09:00
taizan-hokuto
b7e6043a71 Merge branch 'hotfix/memory' 2020-09-12 02:12:46 +09:00
taizan-hokuto
820ba35013 Increment version 2020-09-12 02:02:07 +09:00
taizan-hokuto
ecd2d130bf Clear set each time the extraction changes 2020-09-12 01:57:55 +09:00
taizan-hokuto
f77a2c889b Merge branch 'hotfix/not_quit' 2020-09-12 00:57:48 +09:00
taizan-hokuto
47d5ab288f Increment version 2020-09-12 00:49:37 +09:00
taizan-hokuto
5f53fd24dd Format 2020-09-12 00:48:40 +09:00
taizan-hokuto
11a9d0e2d7 Fix a problem with extraction not completing 2020-09-12 00:42:30 +09:00
8 changed files with 72 additions and 17 deletions

View File

@@ -2,7 +2,7 @@
pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
"""
__copyright__ = 'Copyright (C) 2019 taizan-hokuto'
__version__ = '0.2.2'
__version__ = '0.2.5'
__license__ = 'MIT'
__author__ = 'taizan-hokuto'
__author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'

View File

@@ -5,9 +5,10 @@ import signal
import time
from json.decoder import JSONDecodeError
from pathlib import Path
from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError
from .arguments import Arguments
from .progressbar import ProgressBar
from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError
from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError, UnknownConnectionError
from .. processors.html_archiver import HTMLArchiver
from .. tool.extract.extractor import Extractor
from .. tool.videoinfo import VideoInfo
@@ -50,6 +51,9 @@ def main():
for counter, video_id in enumerate(Arguments().video_ids):
if '[' in video_id:
video_id = video_id.replace('[', '').replace(']', '')
if len(Arguments().video_ids) > 1:
print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
try:
video_id = extract_video_id(video_id)
if os.path.exists(Arguments().output):
@@ -57,8 +61,8 @@ def main():
else:
raise FileNotFoundError
err = None
for _ in range(3): # retry 3 times
try:
for _ in range(3): # retry 3 times
try:
info = VideoInfo(video_id)
break
except (PatternUnmatchError, JSONDecodeError, InvalidVideoIdException) as e:
@@ -71,8 +75,6 @@ def main():
util.save(err.doc, "ERR", ".dat")
continue
if len(Arguments().video_ids) > 1:
print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
print(f"\n"
f" video_id: {video_id}\n"
f" channel: {info.get_channel_name()}\n"
@@ -81,7 +83,7 @@ def main():
print(f" output path: {path.resolve()}")
duration = info.get_duration()
pbar = ProgressBar(total=(duration * 1000), status="Extracting")
ex = Extractor(video_id,
ex = Extractor(video_id,
callback=pbar._disp,
div=10)
signal.signal(signal.SIGINT, (lambda a, b: cancel(ex, pbar)))
@@ -112,6 +114,12 @@ def main():
print("JSONDecodeError.:{}".format(video_id))
if Arguments().save_error_data:
util.save(e.doc, "ERR_JSON_DECODE", ".dat")
except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
except PatternUnmatchError:
print(f"PatternUnmatchError [{video_id}]. ")
except Exception as e:
print(type(e), str(e))
return

View File

@@ -43,7 +43,6 @@ class InvalidVideoIdException(Exception):
self.doc = doc
class UnknownConnectionError(Exception):
pass

View File

@@ -1,9 +1,12 @@
import httpx
import os
import re
import httpx
import time
from base64 import standard_b64encode
from httpx import NetworkError, ReadTimeout
from .chat_processor import ChatProcessor
from .default.processor import DefaultProcessor
from ..exceptions import UnknownConnectionError
PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
@@ -112,7 +115,17 @@ class HTMLArchiver(ChatProcessor):
for item in message_items)
def _encode_img(self, url):
resp = httpx.get(url)
err = None
for _ in range(3):
try:
resp = httpx.get(url)
break
except (NetworkError, ReadTimeout) as e:
err = e
time.sleep(3)
else:
raise UnknownConnectionError(str(err))
return standard_b64encode(resp.content).decode()
def _set_emoji_table(self, item: dict):

View File

@@ -8,14 +8,19 @@ from ... import config
from ... paramgen import arcparam
from ... exceptions import UnknownConnectionError
from concurrent.futures import CancelledError
from httpx import NetworkError, ReadTimeout
from json import JSONDecodeError
from urllib.parse import quote
headers = config.headers
REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
"get_live_chat_replay?continuation="
MAX_RETRY_COUNT = 3
# Set to avoid duplicate parameters
param_set = set()
def _split(start, end, count, min_interval_sec=120):
"""
@@ -50,6 +55,7 @@ def _split(start, end, count, min_interval_sec=120):
def ready_blocks(video_id, duration, div, callback):
param_set.clear()
if div <= 0:
raise ValueError
@@ -62,16 +68,24 @@ def ready_blocks(video_id, duration, div, callback):
async def _create_block(session, video_id, seektime, callback):
continuation = arcparam.getparam(video_id, seektime=seektime)
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
err = None
for _ in range(MAX_RETRY_COUNT):
try:
if continuation in param_set:
next_continuation, actions = None, []
break
param_set.add(continuation)
resp = await session.get(url, headers=headers)
next_continuation, actions = parser.parse(resp.json())
break
except JSONDecodeError:
await asyncio.sleep(3)
except (NetworkError, ReadTimeout) as e:
err = e
await asyncio.sleep(3)
else:
cancel()
raise UnknownConnectionError("Abort: Unknown connection error.")
raise UnknownConnectionError("Abort:" + str(err))
if actions:
first = parser.get_offset(actions[0])
@@ -110,16 +124,24 @@ def fetch_patch(callback, blocks, video_id):
async def _fetch(continuation, session) -> Patch:
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
err = None
for _ in range(MAX_RETRY_COUNT):
try:
if continuation in param_set:
continuation, actions = None, []
break
param_set.add(continuation)
resp = await session.get(url, headers=config.headers)
continuation, actions = parser.parse(resp.json())
break
except JSONDecodeError:
await asyncio.sleep(3)
except (NetworkError, ReadTimeout) as e:
err = e
await asyncio.sleep(3)
else:
cancel()
raise UnknownConnectionError("Abort: Unknown connection error.")
raise UnknownConnectionError("Abort:" + str(err))
if actions:
last = parser.get_offset(actions[-1])

View File

@@ -93,4 +93,5 @@ class Extractor:
return ret
def cancel(self):
print("cancel")
asyncdl.cancel()

View File

@@ -7,7 +7,6 @@ from typing import Tuple
class ExtractWorker:
"""
ExtractWorker associates a download session with a block.
When the worker finishes fetching, the block
being fetched is splitted and assigned the free worker.

View File

@@ -1,10 +1,13 @@
import httpx
import json
import re
import httpx
import time
from httpx import ConnectError, NetworkError
from .. import config
from ..exceptions import InvalidVideoIdException, PatternUnmatchError
from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError
from ..util.extract_video_id import extract_video_id
headers = config.headers
pattern = re.compile(r"'PLAYER_CONFIG': ({.*}}})")
@@ -85,8 +88,18 @@ class VideoInfo:
def _get_page_text(self, video_id):
url = f"https://www.youtube.com/embed/{video_id}"
resp = httpx.get(url, headers=headers)
resp.raise_for_status()
err = None
for _ in range(3):
try:
resp = httpx.get(url, headers=headers)
resp.raise_for_status()
break
except (ConnectError, NetworkError) as e:
err = e
time.sleep(3)
else:
raise UnknownConnectionError(str(err))
return resp.text
def _parse(self, text):