Compare commits
39 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
71650c39f7 | ||
|
|
488445c73b | ||
|
|
075e811efe | ||
|
|
58d9bf7fdb | ||
|
|
b3e6275de7 | ||
|
|
748778f545 | ||
|
|
e29b3b8377 | ||
|
|
0859ed5fb1 | ||
|
|
a80d5ba080 | ||
|
|
b7e6043a71 | ||
|
|
820ba35013 | ||
|
|
ecd2d130bf | ||
|
|
f77a2c889b | ||
|
|
47d5ab288f | ||
|
|
5f53fd24dd | ||
|
|
11a9d0e2d7 | ||
|
|
480c9e15b8 | ||
|
|
35aa7636f6 | ||
|
|
8fee67c2d4 | ||
|
|
d3f1643a40 | ||
|
|
eb29f27493 | ||
|
|
8adf75ab83 | ||
|
|
2e05803d75 | ||
|
|
f16c0ee73a | ||
|
|
a338f2b782 | ||
|
|
864ccddfd7 | ||
|
|
339df69e36 | ||
|
|
76a5b0cd18 | ||
|
|
be0ab2431b | ||
|
|
2edb60c592 | ||
|
|
2c6c3a1ca3 | ||
|
|
4be540793d | ||
|
|
08b86fe596 | ||
|
|
157f3b9952 | ||
|
|
8f3ca2662a | ||
|
|
c4b015861c | ||
|
|
3aa413d59e | ||
|
|
03ba285a16 | ||
|
|
5fe0ee5aa8 |
@@ -2,7 +2,7 @@
|
||||
pytchat is a lightweight python library to browse youtube livechat without Selenium or BeautifulSoup.
|
||||
"""
|
||||
__copyright__ = 'Copyright (C) 2019 taizan-hokuto'
|
||||
__version__ = '0.1.8'
|
||||
__version__ = '0.2.7'
|
||||
__license__ = 'MIT'
|
||||
__author__ = 'taizan-hokuto'
|
||||
__author_email__ = '55448286+taizan-hokuto@users.noreply.github.com'
|
||||
|
||||
@@ -2,11 +2,13 @@ import argparse
|
||||
|
||||
import os
|
||||
import signal
|
||||
import time
|
||||
from json.decoder import JSONDecodeError
|
||||
from pathlib import Path
|
||||
from httpcore import ReadTimeout as HCReadTimeout, NetworkError as HCNetworkError
|
||||
from .arguments import Arguments
|
||||
from .progressbar import ProgressBar
|
||||
from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError
|
||||
from .. exceptions import InvalidVideoIdException, NoContents, PatternUnmatchError, UnknownConnectionError
|
||||
from .. processors.html_archiver import HTMLArchiver
|
||||
from .. tool.extract.extractor import Extractor
|
||||
from .. tool.videoinfo import VideoInfo
|
||||
@@ -32,11 +34,12 @@ def main():
|
||||
'If ID starts with a hyphen (-), enclose the ID in square brackets.')
|
||||
parser.add_argument('-o', f'--{Arguments.Name.OUTPUT}', type=str,
|
||||
help='Output directory (end with "/"). default="./"', default='./')
|
||||
parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true',
|
||||
help='Show version')
|
||||
parser.add_argument(f'--{Arguments.Name.SAVE_ERROR_DATA}', action='store_true',
|
||||
help='Save error data when error occurs(".dat" file)')
|
||||
parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true',
|
||||
help='Show version')
|
||||
Arguments(parser.parse_args().__dict__)
|
||||
|
||||
if Arguments().print_version:
|
||||
print(f'pytchat v{__version__} © 2019 taizan-hokuto')
|
||||
return
|
||||
@@ -45,25 +48,44 @@ def main():
|
||||
if not Arguments().video_ids:
|
||||
parser.print_help()
|
||||
return
|
||||
for video_id in Arguments().video_ids:
|
||||
for counter, video_id in enumerate(Arguments().video_ids):
|
||||
if '[' in video_id:
|
||||
video_id = video_id.replace('[', '').replace(']', '')
|
||||
if len(Arguments().video_ids) > 1:
|
||||
print(f"\n{'-' * 10} video:{counter + 1} of {len(Arguments().video_ids)} {'-' * 10}")
|
||||
|
||||
try:
|
||||
video_id = extract_video_id(video_id)
|
||||
if os.path.exists(Arguments().output):
|
||||
path = Path(Arguments().output + video_id + '.html')
|
||||
if Arguments().output[-1] != "/" or Arguments().output[-1] != "\\":
|
||||
Arguments().output = '/'.join([Arguments().output, os.path.sep])
|
||||
path = util.checkpath(Path.resolve(Path(Arguments().output + video_id + '.html')))
|
||||
else:
|
||||
raise FileNotFoundError
|
||||
info = VideoInfo(video_id)
|
||||
print(f"Extracting...\n"
|
||||
err = None
|
||||
for _ in range(3): # retry 3 times
|
||||
try:
|
||||
info = VideoInfo(video_id)
|
||||
break
|
||||
except (PatternUnmatchError, JSONDecodeError, InvalidVideoIdException) as e:
|
||||
err = e
|
||||
time.sleep(2)
|
||||
continue
|
||||
else:
|
||||
print("Cannot parse video information.:{}".format(video_id))
|
||||
if Arguments().save_error_data:
|
||||
util.save(err.doc, "ERR", ".dat")
|
||||
continue
|
||||
|
||||
print(f"\n"
|
||||
f" video_id: {video_id}\n"
|
||||
f" channel: {info.get_channel_name()}\n"
|
||||
f" title: {info.get_title()}")
|
||||
|
||||
print(f" output path: {path.resolve()}")
|
||||
print(f" output path: {path}")
|
||||
duration = info.get_duration()
|
||||
pbar = ProgressBar(total=(duration * 1000) / 0.99, status="Extracting")
|
||||
ex = Extractor(video_id,
|
||||
pbar = ProgressBar(total=(duration * 1000), status="Extracting")
|
||||
ex = Extractor(video_id,
|
||||
callback=pbar._disp,
|
||||
div=10)
|
||||
signal.signal(signal.SIGINT, (lambda a, b: cancel(ex, pbar)))
|
||||
@@ -83,31 +105,27 @@ def main():
|
||||
print()
|
||||
if pbar.is_cancelled():
|
||||
print("\nThe extraction process has been discontinued.\n")
|
||||
return False
|
||||
return True
|
||||
|
||||
except InvalidVideoIdException:
|
||||
print("Invalid Video ID or URL:", video_id)
|
||||
except TypeError as e:
|
||||
print(e.with_traceback())
|
||||
except NoContents as e:
|
||||
print(e)
|
||||
except FileNotFoundError:
|
||||
print("The specified directory does not exist.:{}".format(Arguments().output))
|
||||
except JSONDecodeError as e:
|
||||
print(e.msg)
|
||||
print("Cannot parse video information.:{}".format(video_id))
|
||||
print("JSONDecodeError.:{}".format(video_id))
|
||||
if Arguments().save_error_data:
|
||||
util.save(e.doc, "ERR_JSON_DECODE", ".dat")
|
||||
except PatternUnmatchError as e:
|
||||
print(e.msg)
|
||||
print("Cannot parse video information.:{}".format(video_id))
|
||||
if Arguments().save_error_data:
|
||||
util.save(e.doc, "ERR_PATTERN_UNMATCH", ".dat")
|
||||
except (UnknownConnectionError, HCNetworkError, HCReadTimeout) as e:
|
||||
print(f"An unknown network error occurred during the processing of [{video_id}]. : " + str(e))
|
||||
except PatternUnmatchError:
|
||||
print(f"PatternUnmatchError [{video_id}]. ")
|
||||
except Exception as e:
|
||||
print(type(e), str(e))
|
||||
|
||||
return
|
||||
|
||||
|
||||
def cancel(ex: Extractor, pbar: ProgressBar):
|
||||
def cancel(ex, pbar):
|
||||
ex.cancel()
|
||||
pbar.cancel()
|
||||
|
||||
@@ -36,6 +36,7 @@ class Arguments(metaclass=Singleton):
|
||||
self.output: str = arguments[Arguments.Name.OUTPUT]
|
||||
self.video_ids: List[int] = []
|
||||
self.save_error_data: bool = arguments[Arguments.Name.SAVE_ERROR_DATA]
|
||||
|
||||
# Videos
|
||||
if arguments[Arguments.Name.VIDEO_IDS]:
|
||||
self.video_ids = [video_id
|
||||
|
||||
@@ -4,10 +4,9 @@ vladignatyev/progress.py
|
||||
https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
|
||||
(MIT License)
|
||||
'''
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
ROT = ['\u25F4', '\u25F5', '\u25F6', '\u25F7']
|
||||
|
||||
|
||||
class ProgressBar:
|
||||
def __init__(self, total, status):
|
||||
@@ -15,8 +14,9 @@ class ProgressBar:
|
||||
self._cancelled = False
|
||||
self.reset(total=total, status=status)
|
||||
self._blinker = 0
|
||||
|
||||
|
||||
def reset(self, symbol_done="=", symbol_space=" ", total=100, status=''):
|
||||
self.con_width = shutil.get_terminal_size(fallback=(80, 24)).columns
|
||||
self._symbol_done = symbol_done
|
||||
self._symbol_space = symbol_space
|
||||
self._total = total
|
||||
@@ -39,7 +39,9 @@ class ProgressBar:
|
||||
|
||||
bar = self._symbol_done * filled_len + \
|
||||
self._symbol_space * (self._bar_len - filled_len)
|
||||
sys.stdout.write(' [%s] %s%s ...%s %s \r' % (bar, percents, '%', self._status, ROT[self._blinker % 4]))
|
||||
disp = f" [{bar}] {percents:>5.1f}% ...{self._status} "[:self.con_width - 1] + '\r'
|
||||
|
||||
sys.stdout.write(disp)
|
||||
sys.stdout.flush()
|
||||
self._blinker += 1
|
||||
|
||||
|
||||
@@ -38,7 +38,9 @@ class InvalidVideoIdException(Exception):
|
||||
'''
|
||||
Thrown when the video_id is not exist (VideoInfo).
|
||||
'''
|
||||
pass
|
||||
def __init__(self, doc):
|
||||
self.msg = "InvalidVideoIdException"
|
||||
self.doc = doc
|
||||
|
||||
|
||||
class UnknownConnectionError(Exception):
|
||||
@@ -47,7 +49,7 @@ class UnknownConnectionError(Exception):
|
||||
|
||||
class RetryExceedMaxCount(Exception):
|
||||
'''
|
||||
thrown when the number of retries exceeds the maximum value.
|
||||
Thrown when the number of retries exceeds the maximum value.
|
||||
'''
|
||||
pass
|
||||
|
||||
@@ -66,13 +68,13 @@ class FailedExtractContinuation(ChatDataFinished):
|
||||
|
||||
class VideoInfoParseError(Exception):
|
||||
'''
|
||||
thrown when failed to parse video info
|
||||
Base exception when parsing video info.
|
||||
'''
|
||||
|
||||
|
||||
class PatternUnmatchError(VideoInfoParseError):
|
||||
'''
|
||||
thrown when failed to parse video info with unmatched pattern
|
||||
Thrown when failed to parse video info with unmatched pattern.
|
||||
'''
|
||||
def __init__(self, doc):
|
||||
self.msg = "PatternUnmatchError"
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
import httpx
|
||||
import os
|
||||
import re
|
||||
import httpx
|
||||
import time
|
||||
from base64 import standard_b64encode
|
||||
from httpx import NetworkError, ReadTimeout
|
||||
from .chat_processor import ChatProcessor
|
||||
from .default.processor import DefaultProcessor
|
||||
from ..exceptions import UnknownConnectionError
|
||||
|
||||
|
||||
PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
|
||||
@@ -43,7 +46,7 @@ class HTMLArchiver(ChatProcessor):
|
||||
'''
|
||||
HTMLArchiver saves chat data as HTML table format.
|
||||
'''
|
||||
def __init__(self, save_path, callback):
|
||||
def __init__(self, save_path, callback=None):
|
||||
super().__init__()
|
||||
self.save_path = self._checkpath(save_path)
|
||||
self.processor = DefaultProcessor()
|
||||
@@ -93,7 +96,8 @@ class HTMLArchiver(ChatProcessor):
|
||||
c.author.channelId)
|
||||
)
|
||||
)
|
||||
self.callback(None, 1)
|
||||
if self.callback:
|
||||
self.callback(None, 1)
|
||||
|
||||
def _parse_html_line(self, raw_line):
|
||||
return ''.join(('<tr>',
|
||||
@@ -111,7 +115,18 @@ class HTMLArchiver(ChatProcessor):
|
||||
for item in message_items)
|
||||
|
||||
def _encode_img(self, url):
|
||||
resp = httpx.get(url)
|
||||
err = None
|
||||
for _ in range(5):
|
||||
try:
|
||||
resp = httpx.get(url, timeout=30)
|
||||
break
|
||||
except (NetworkError, ReadTimeout) as e:
|
||||
print("Network Error. retrying...")
|
||||
err = e
|
||||
time.sleep(3)
|
||||
else:
|
||||
raise UnknownConnectionError(str(err))
|
||||
|
||||
return standard_b64encode(resp.content).decode()
|
||||
|
||||
def _set_emoji_table(self, item: dict):
|
||||
|
||||
@@ -8,14 +8,19 @@ from ... import config
|
||||
from ... paramgen import arcparam
|
||||
from ... exceptions import UnknownConnectionError
|
||||
from concurrent.futures import CancelledError
|
||||
from httpx import NetworkError, ReadTimeout
|
||||
from json import JSONDecodeError
|
||||
from urllib.parse import quote
|
||||
|
||||
|
||||
headers = config.headers
|
||||
REPLAY_URL = "https://www.youtube.com/live_chat_replay/" \
|
||||
"get_live_chat_replay?continuation="
|
||||
MAX_RETRY_COUNT = 3
|
||||
|
||||
# Set to avoid duplicate parameters
|
||||
param_set = set()
|
||||
|
||||
|
||||
def _split(start, end, count, min_interval_sec=120):
|
||||
"""
|
||||
@@ -50,6 +55,7 @@ def _split(start, end, count, min_interval_sec=120):
|
||||
|
||||
|
||||
def ready_blocks(video_id, duration, div, callback):
|
||||
param_set.clear()
|
||||
if div <= 0:
|
||||
raise ValueError
|
||||
|
||||
@@ -62,16 +68,24 @@ def ready_blocks(video_id, duration, div, callback):
|
||||
async def _create_block(session, video_id, seektime, callback):
|
||||
continuation = arcparam.getparam(video_id, seektime=seektime)
|
||||
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
|
||||
err = None
|
||||
for _ in range(MAX_RETRY_COUNT):
|
||||
try:
|
||||
if continuation in param_set:
|
||||
next_continuation, actions = None, []
|
||||
break
|
||||
param_set.add(continuation)
|
||||
resp = await session.get(url, headers=headers)
|
||||
next_continuation, actions = parser.parse(resp.json())
|
||||
break
|
||||
except JSONDecodeError:
|
||||
await asyncio.sleep(3)
|
||||
except (NetworkError, ReadTimeout) as e:
|
||||
err = e
|
||||
await asyncio.sleep(3)
|
||||
else:
|
||||
cancel()
|
||||
raise UnknownConnectionError("Abort: Unknown connection error.")
|
||||
raise UnknownConnectionError("Abort:" + str(err))
|
||||
|
||||
if actions:
|
||||
first = parser.get_offset(actions[0])
|
||||
@@ -110,16 +124,24 @@ def fetch_patch(callback, blocks, video_id):
|
||||
|
||||
async def _fetch(continuation, session) -> Patch:
|
||||
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
|
||||
err = None
|
||||
for _ in range(MAX_RETRY_COUNT):
|
||||
try:
|
||||
if continuation in param_set:
|
||||
continuation, actions = None, []
|
||||
break
|
||||
param_set.add(continuation)
|
||||
resp = await session.get(url, headers=config.headers)
|
||||
continuation, actions = parser.parse(resp.json())
|
||||
break
|
||||
except JSONDecodeError:
|
||||
await asyncio.sleep(3)
|
||||
except (NetworkError, ReadTimeout) as e:
|
||||
err = e
|
||||
await asyncio.sleep(3)
|
||||
else:
|
||||
cancel()
|
||||
raise UnknownConnectionError("Abort: Unknown connection error.")
|
||||
raise UnknownConnectionError("Abort:" + str(err))
|
||||
|
||||
if actions:
|
||||
last = parser.get_offset(actions[-1])
|
||||
|
||||
@@ -93,4 +93,5 @@ class Extractor:
|
||||
return ret
|
||||
|
||||
def cancel(self):
|
||||
print("cancel")
|
||||
asyncdl.cancel()
|
||||
|
||||
@@ -42,10 +42,14 @@ def get_offset(item):
|
||||
|
||||
|
||||
def get_id(item):
|
||||
return list((list(item['replayChatItemAction']["actions"][0].values()
|
||||
)[0])['item'].values())[0].get('id')
|
||||
a = list(item['replayChatItemAction']["actions"][0].values())[0].get('item')
|
||||
if a:
|
||||
return list(a.values())[0].get('id')
|
||||
return None
|
||||
|
||||
|
||||
def get_type(item):
|
||||
return list((list(item['replayChatItemAction']["actions"][0].values()
|
||||
)[0])['item'].keys())[0]
|
||||
a = list(item['replayChatItemAction']["actions"][0].values())[0].get('item')
|
||||
if a:
|
||||
return list(a.keys())[0]
|
||||
return None
|
||||
|
||||
@@ -7,7 +7,6 @@ from typing import Tuple
|
||||
class ExtractWorker:
|
||||
"""
|
||||
ExtractWorker associates a download session with a block.
|
||||
|
||||
When the worker finishes fetching, the block
|
||||
being fetched is splitted and assigned the free worker.
|
||||
|
||||
|
||||
@@ -1,146 +0,0 @@
|
||||
|
||||
import httpx
|
||||
import asyncio
|
||||
import json
|
||||
from . import parser
|
||||
from . block import Block
|
||||
from . worker import ExtractWorker
|
||||
from . patch import Patch
|
||||
from ... import config
|
||||
from ... paramgen import arcparam_mining as arcparam
|
||||
from concurrent.futures import CancelledError
|
||||
from urllib.parse import quote
|
||||
|
||||
headers = config.headers
|
||||
REPLAY_URL = "https://www.youtube.com/live_chat_replay?continuation="
|
||||
INTERVAL = 1
|
||||
|
||||
|
||||
def _split(start, end, count, min_interval_sec=120):
|
||||
"""
|
||||
Split section from `start` to `end` into `count` pieces,
|
||||
and returns the beginning of each piece.
|
||||
The `count` is adjusted so that the length of each piece
|
||||
is no smaller than `min_interval`.
|
||||
|
||||
Returns:
|
||||
--------
|
||||
List of the offset of each block's first chat data.
|
||||
"""
|
||||
|
||||
if not (isinstance(start, int) or isinstance(start, float)) or \
|
||||
not (isinstance(end, int) or isinstance(end, float)):
|
||||
raise ValueError("start/end must be int or float")
|
||||
if not isinstance(count, int):
|
||||
raise ValueError("count must be int")
|
||||
if start > end:
|
||||
raise ValueError("end must be equal to or greater than start.")
|
||||
if count < 1:
|
||||
raise ValueError("count must be equal to or greater than 1.")
|
||||
if (end - start) / count < min_interval_sec:
|
||||
count = int((end - start) / min_interval_sec)
|
||||
if count == 0:
|
||||
count = 1
|
||||
interval = (end - start) / count
|
||||
|
||||
if count == 1:
|
||||
return [start]
|
||||
return sorted(list(set([int(start + interval * j)
|
||||
for j in range(count)])))
|
||||
|
||||
|
||||
def ready_blocks(video_id, duration, div, callback):
|
||||
if div <= 0:
|
||||
raise ValueError
|
||||
|
||||
async def _get_blocks(video_id, duration, div, callback):
|
||||
async with httpx.ClientSession() as session:
|
||||
tasks = [_create_block(session, video_id, seektime, callback)
|
||||
for seektime in _split(0, duration, div)]
|
||||
return await asyncio.gather(*tasks)
|
||||
|
||||
async def _create_block(session, video_id, seektime, callback):
|
||||
continuation = arcparam.getparam(video_id, seektime=seektime)
|
||||
url = (f"{REPLAY_URL}{quote(continuation)}&playerOffsetMs="
|
||||
f"{int(seektime*1000)}&hidden=false&pbj=1")
|
||||
async with session.get(url, headers=headers) as resp:
|
||||
chat_json = await resp.text()
|
||||
if chat_json is None:
|
||||
return
|
||||
continuation, actions = parser.parse(json.loads(chat_json)[1])
|
||||
first = seektime
|
||||
seektime += INTERVAL
|
||||
if callback:
|
||||
callback(actions, INTERVAL)
|
||||
return Block(
|
||||
continuation=continuation,
|
||||
chat_data=actions,
|
||||
first=first,
|
||||
last=seektime,
|
||||
seektime=seektime
|
||||
)
|
||||
"""
|
||||
fetch initial blocks.
|
||||
"""
|
||||
loop = asyncio.get_event_loop()
|
||||
blocks = loop.run_until_complete(
|
||||
_get_blocks(video_id, duration, div, callback))
|
||||
return blocks
|
||||
|
||||
|
||||
def fetch_patch(callback, blocks, video_id):
|
||||
|
||||
async def _allocate_workers():
|
||||
workers = [
|
||||
ExtractWorker(
|
||||
fetch=_fetch, block=block,
|
||||
blocks=blocks, video_id=video_id
|
||||
)
|
||||
for block in blocks
|
||||
]
|
||||
async with httpx.ClientSession() as session:
|
||||
tasks = [worker.run(session) for worker in workers]
|
||||
return await asyncio.gather(*tasks)
|
||||
|
||||
async def _fetch(seektime, session) -> Patch:
|
||||
continuation = arcparam.getparam(video_id, seektime=seektime)
|
||||
url = (f"{REPLAY_URL}{quote(continuation)}&playerOffsetMs="
|
||||
f"{int(seektime*1000)}&hidden=false&pbj=1")
|
||||
async with session.get(url, headers=config.headers) as resp:
|
||||
chat_json = await resp.text()
|
||||
actions = []
|
||||
try:
|
||||
if chat_json is None:
|
||||
return Patch()
|
||||
continuation, actions = parser.parse(json.loads(chat_json)[1])
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
if callback:
|
||||
callback(actions, INTERVAL)
|
||||
return Patch(chats=actions, continuation=continuation,
|
||||
seektime=seektime, last=seektime)
|
||||
"""
|
||||
allocate workers and assign blocks.
|
||||
"""
|
||||
loop = asyncio.get_event_loop()
|
||||
try:
|
||||
loop.run_until_complete(_allocate_workers())
|
||||
except CancelledError:
|
||||
pass
|
||||
|
||||
|
||||
async def _shutdown():
|
||||
print("\nshutdown...")
|
||||
tasks = [t for t in asyncio.all_tasks()
|
||||
if t is not asyncio.current_task()]
|
||||
for task in tasks:
|
||||
task.cancel()
|
||||
try:
|
||||
await task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
|
||||
def cancel():
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.create_task(_shutdown())
|
||||
@@ -1,62 +0,0 @@
|
||||
from . import parser
|
||||
class Block:
|
||||
"""Block object represents something like a box
|
||||
to join chunk of chatdata.
|
||||
|
||||
Parameter:
|
||||
---------
|
||||
first : int :
|
||||
videoOffsetTimeMs of the first chat_data
|
||||
(chat_data[0])
|
||||
|
||||
last : int :
|
||||
videoOffsetTimeMs of the last chat_data.
|
||||
(chat_data[-1])
|
||||
|
||||
this value increases as fetching chatdata progresses.
|
||||
|
||||
end : int :
|
||||
target videoOffsetTimeMs of last chat data for extract,
|
||||
equals to first videoOffsetTimeMs of next block.
|
||||
when extract worker reaches this offset, stop fetching.
|
||||
|
||||
continuation : str :
|
||||
continuation param of last chat data.
|
||||
|
||||
chat_data : list
|
||||
|
||||
done : bool :
|
||||
whether this block has been fetched.
|
||||
|
||||
remaining : int :
|
||||
remaining data to extract.
|
||||
equals end - last.
|
||||
|
||||
is_last : bool :
|
||||
whether this block is the last one in blocklist.
|
||||
|
||||
during_split : bool :
|
||||
whether this block is in the process of during_split.
|
||||
while True, this block is excluded from duplicate split procedure.
|
||||
|
||||
seektime : float :
|
||||
the last position of this block(seconds) already fetched.
|
||||
"""
|
||||
|
||||
__slots__ = ['first','last','end','continuation','chat_data','remaining',
|
||||
'done','is_last','during_split','seektime']
|
||||
|
||||
def __init__(self, first = 0, last = 0, end = 0,
|
||||
continuation = '', chat_data = [], is_last = False,
|
||||
during_split = False, seektime = None):
|
||||
self.first = first
|
||||
self.last = last
|
||||
self.end = end
|
||||
self.continuation = continuation
|
||||
self.chat_data = chat_data
|
||||
self.done = False
|
||||
self.remaining = self.end - self.last
|
||||
self.is_last = is_last
|
||||
self.during_split = during_split
|
||||
self.seektime = seektime
|
||||
|
||||
@@ -1,73 +0,0 @@
|
||||
import re
|
||||
from ... import config
|
||||
from ... exceptions import (
|
||||
ResponseContextError,
|
||||
NoContents, NoContinuation)
|
||||
|
||||
logger = config.logger(__name__)
|
||||
|
||||
|
||||
def parse(jsn):
|
||||
"""
|
||||
Parse replay chat data.
|
||||
Parameter:
|
||||
----------
|
||||
jsn : dict
|
||||
JSON of replay chat data.
|
||||
Returns:
|
||||
------
|
||||
continuation : str
|
||||
actions : list
|
||||
|
||||
"""
|
||||
if jsn is None:
|
||||
raise ValueError("parameter JSON is None")
|
||||
if jsn['response']['responseContext'].get('errors'):
|
||||
raise ResponseContextError(
|
||||
'video_id is invalid or private/deleted.')
|
||||
contents = jsn["response"].get('continuationContents')
|
||||
if contents is None:
|
||||
raise NoContents('No chat data.')
|
||||
|
||||
cont = contents['liveChatContinuation']['continuations'][0]
|
||||
if cont is None:
|
||||
raise NoContinuation('No Continuation')
|
||||
metadata = cont.get('liveChatReplayContinuationData')
|
||||
if metadata:
|
||||
continuation = metadata.get("continuation")
|
||||
actions = contents['liveChatContinuation'].get('actions')
|
||||
if continuation:
|
||||
return continuation, [action["replayChatItemAction"]["actions"][0]
|
||||
for action in actions
|
||||
if list(action['replayChatItemAction']["actions"][0].values()
|
||||
)[0]['item'].get("liveChatPaidMessageRenderer")
|
||||
or list(action['replayChatItemAction']["actions"][0].values()
|
||||
)[0]['item'].get("liveChatPaidStickerRenderer")
|
||||
]
|
||||
return None, []
|
||||
|
||||
|
||||
def get_offset(item):
|
||||
return int(item['replayChatItemAction']["videoOffsetTimeMsec"])
|
||||
|
||||
|
||||
def get_id(item):
|
||||
return list((list(item['replayChatItemAction']["actions"][0].values()
|
||||
)[0])['item'].values())[0].get('id')
|
||||
|
||||
|
||||
def get_type(item):
|
||||
return list((list(item['replayChatItemAction']["actions"][0].values()
|
||||
)[0])['item'].keys())[0]
|
||||
|
||||
|
||||
_REGEX_YTINIT = re.compile(
|
||||
"window\\[\"ytInitialData\"\\]\\s*=\\s*({.+?});\\s+")
|
||||
|
||||
|
||||
def extract(text):
|
||||
|
||||
match = re.findall(_REGEX_YTINIT, str(text))
|
||||
if match:
|
||||
return match[0]
|
||||
return None
|
||||
@@ -1,27 +0,0 @@
|
||||
from . import parser
|
||||
from . block import Block
|
||||
from typing import NamedTuple
|
||||
|
||||
class Patch(NamedTuple):
|
||||
"""
|
||||
Patch represents chunk of chat data
|
||||
which is fetched by asyncdl.fetch_patch._fetch().
|
||||
"""
|
||||
chats : list = []
|
||||
continuation : str = None
|
||||
seektime : float = None
|
||||
first : int = None
|
||||
last : int = None
|
||||
|
||||
def fill(block:Block, patch:Patch):
|
||||
if patch.last < block.end:
|
||||
set_patch(block, patch)
|
||||
return
|
||||
block.continuation = None
|
||||
|
||||
def set_patch(block:Block, patch:Patch):
|
||||
block.continuation = patch.continuation
|
||||
block.chat_data.extend(patch.chats)
|
||||
block.last = patch.seektime
|
||||
block.seektime = patch.seektime
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
from . import asyncdl
|
||||
from . import parser
|
||||
from .. videoinfo import VideoInfo
|
||||
from ... import config
|
||||
from ... exceptions import InvalidVideoIdException
|
||||
logger = config.logger(__name__)
|
||||
headers=config.headers
|
||||
|
||||
class SuperChatMiner:
|
||||
def __init__(self, video_id, duration, div, callback):
|
||||
if not isinstance(div ,int) or div < 1:
|
||||
raise ValueError('div must be positive integer.')
|
||||
elif div > 10:
|
||||
div = 10
|
||||
if not isinstance(duration ,int) or duration < 1:
|
||||
raise ValueError('duration must be positive integer.')
|
||||
self.video_id = video_id
|
||||
self.duration = duration
|
||||
self.div = div
|
||||
self.callback = callback
|
||||
self.blocks = []
|
||||
|
||||
def _ready_blocks(self):
|
||||
blocks = asyncdl.ready_blocks(
|
||||
self.video_id, self.duration, self.div, self.callback)
|
||||
self.blocks = [block for block in blocks if block is not None]
|
||||
return self
|
||||
|
||||
def _set_block_end(self):
|
||||
for i in range(len(self.blocks)-1):
|
||||
self.blocks[i].end = self.blocks[i+1].first
|
||||
self.blocks[-1].end = self.duration
|
||||
self.blocks[-1].is_last =True
|
||||
return self
|
||||
|
||||
def _download_blocks(self):
|
||||
asyncdl.fetch_patch(self.callback, self.blocks, self.video_id)
|
||||
return self
|
||||
|
||||
def _combine(self):
|
||||
ret = []
|
||||
for block in self.blocks:
|
||||
ret.extend(block.chat_data)
|
||||
return ret
|
||||
|
||||
def extract(self):
|
||||
return (
|
||||
self._ready_blocks()
|
||||
._set_block_end()
|
||||
._download_blocks()
|
||||
._combine()
|
||||
)
|
||||
|
||||
def extract(video_id, div = 1, callback = None, processor = None):
|
||||
duration = 0
|
||||
try:
|
||||
duration = VideoInfo(video_id).get_duration()
|
||||
except InvalidVideoIdException:
|
||||
raise
|
||||
if duration == 0:
|
||||
print("video is live.")
|
||||
return []
|
||||
data = SuperChatMiner(video_id, duration, div, callback).extract()
|
||||
if processor is None:
|
||||
return data
|
||||
return processor.process(
|
||||
[{'video_id':None,'timeout':1,'chatdata' : (action
|
||||
for action in data)}]
|
||||
)
|
||||
|
||||
def cancel():
|
||||
asyncdl.cancel()
|
||||
@@ -1,45 +0,0 @@
|
||||
from . import parser
|
||||
from . block import Block
|
||||
from . patch import Patch, fill
|
||||
from ... paramgen import arcparam
|
||||
INTERVAL = 1
|
||||
class ExtractWorker:
|
||||
"""
|
||||
ExtractWorker associates a download session with a block.
|
||||
|
||||
When the worker finishes fetching, the block
|
||||
being fetched is splitted and assigned the free worker.
|
||||
|
||||
Parameter
|
||||
----------
|
||||
fetch : func :
|
||||
extract function of asyncdl
|
||||
|
||||
block : Block :
|
||||
Block object that includes chat_data
|
||||
|
||||
blocks : list :
|
||||
List of Block(s)
|
||||
|
||||
video_id : str :
|
||||
|
||||
parent_block : Block :
|
||||
the block from which current block is splitted
|
||||
"""
|
||||
__slots__ = ['block', 'fetch', 'blocks', 'video_id', 'parent_block']
|
||||
def __init__(self, fetch, block, blocks, video_id ):
|
||||
self.block:Block = block
|
||||
self.fetch = fetch
|
||||
self.blocks:list = blocks
|
||||
self.video_id:str = video_id
|
||||
self.parent_block:Block = None
|
||||
|
||||
async def run(self, session):
|
||||
while self.block.continuation:
|
||||
patch = await self.fetch(
|
||||
self.block.seektime, session)
|
||||
fill(self.block, patch)
|
||||
self.block.seektime += INTERVAL
|
||||
self.block.done = True
|
||||
|
||||
|
||||
@@ -1,13 +1,16 @@
|
||||
import httpx
|
||||
import json
|
||||
import re
|
||||
import httpx
|
||||
import time
|
||||
from httpx import ConnectError, NetworkError
|
||||
from .. import config
|
||||
from ..exceptions import InvalidVideoIdException, PatternUnmatchError
|
||||
from ..exceptions import InvalidVideoIdException, PatternUnmatchError, UnknownConnectionError
|
||||
from ..util.extract_video_id import extract_video_id
|
||||
|
||||
headers = config.headers
|
||||
|
||||
pattern = re.compile(r"'PLAYER_CONFIG': ({.*}}})")
|
||||
headers = config.headers
|
||||
|
||||
pattern = re.compile(r"['\"]PLAYER_CONFIG['\"]:\s*({.*})")
|
||||
|
||||
item_channel_id = [
|
||||
"videoDetails",
|
||||
@@ -80,19 +83,37 @@ class VideoInfo:
|
||||
|
||||
def __init__(self, video_id):
|
||||
self.video_id = extract_video_id(video_id)
|
||||
text = self._get_page_text(self.video_id)
|
||||
self._parse(text)
|
||||
for _ in range(3):
|
||||
try:
|
||||
text = self._get_page_text(self.video_id)
|
||||
self._parse(text)
|
||||
break
|
||||
except PatternUnmatchError:
|
||||
time.sleep(2)
|
||||
pass
|
||||
else:
|
||||
raise PatternUnmatchError("Pattern Unmatch")
|
||||
|
||||
def _get_page_text(self, video_id):
|
||||
url = f"https://www.youtube.com/embed/{video_id}"
|
||||
resp = httpx.get(url, headers=headers)
|
||||
resp.raise_for_status()
|
||||
err = None
|
||||
for _ in range(3):
|
||||
try:
|
||||
resp = httpx.get(url, headers=headers)
|
||||
resp.raise_for_status()
|
||||
break
|
||||
except (ConnectError, NetworkError) as e:
|
||||
err = e
|
||||
time.sleep(3)
|
||||
else:
|
||||
raise UnknownConnectionError(str(err))
|
||||
|
||||
return resp.text
|
||||
|
||||
def _parse(self, text):
|
||||
result = re.search(pattern, text)
|
||||
if result is None:
|
||||
raise PatternUnmatchError(text)
|
||||
raise PatternUnmatchError()
|
||||
decoder = json.JSONDecoder()
|
||||
res = decoder.raw_decode(result.group(1)[:-1])[0]
|
||||
response = self._get_item(res, item_response)
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
import datetime
|
||||
import httpx
|
||||
import json
|
||||
import datetime
|
||||
import os
|
||||
import re
|
||||
from .. import config
|
||||
|
||||
PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
|
||||
|
||||
|
||||
def extract(url):
|
||||
_session = httpx.Client(http2=True)
|
||||
@@ -16,3 +20,21 @@ def save(data, filename, extention):
|
||||
with open(filename + "_" + (datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')) + extention,
|
||||
mode='w', encoding='utf-8') as f:
|
||||
f.writelines(data)
|
||||
|
||||
|
||||
def checkpath(filepath):
|
||||
splitter = os.path.splitext(os.path.basename(filepath))
|
||||
body = splitter[0]
|
||||
extention = splitter[1]
|
||||
newpath = filepath
|
||||
counter = 1
|
||||
while os.path.exists(newpath):
|
||||
match = re.search(PATTERN, body)
|
||||
if match:
|
||||
counter = int(match[2]) + 1
|
||||
num_with_bracket = f'({str(counter)})'
|
||||
body = f'{match[1]}{num_with_bracket}'
|
||||
else:
|
||||
body = f'{body}({str(counter)})'
|
||||
newpath = os.path.join(os.path.dirname(filepath), body + extention)
|
||||
return newpath
|
||||
|
||||
@@ -1,41 +0,0 @@
|
||||
from pytchat.tool.mining import parser
|
||||
import pytchat.config as config
|
||||
import httpx
|
||||
import json
|
||||
from pytchat.paramgen import arcparam_mining as arcparam
|
||||
|
||||
|
||||
def test_arcparam_e(mocker):
|
||||
try:
|
||||
arcparam.getparam("01234567890", -1)
|
||||
assert False
|
||||
except ValueError:
|
||||
assert True
|
||||
|
||||
|
||||
def test_arcparam_0(mocker):
|
||||
param = arcparam.getparam("01234567890", 0)
|
||||
|
||||
assert param == "op2w0wQsGiBDZzhhRFFvTE1ERXlNelExTmpjNE9UQWdBUSUzRCUzREABYARyAggBeAE%3D"
|
||||
|
||||
|
||||
def test_arcparam_1(mocker):
|
||||
param = arcparam.getparam("01234567890", seektime=100000)
|
||||
print(param)
|
||||
assert param == "op2w0wQzGiBDZzhhRFFvTE1ERXlNelExTmpjNE9UQWdBUSUzRCUzREABWgUQgMLXL2AEcgIIAXgB"
|
||||
|
||||
|
||||
def test_arcparam_2(mocker):
|
||||
param = arcparam.getparam("PZz9NB0-Z64", 1)
|
||||
url = f"https://www.youtube.com/live_chat_replay?continuation={param}&playerOffsetMs=1000&pbj=1"
|
||||
resp = httpx.Client(http2=True).get(url, headers=config.headers)
|
||||
jsn = json.loads(resp.text)
|
||||
_, chatdata = parser.parse(jsn[1])
|
||||
test_id = chatdata[0]["addChatItemAction"]["item"]["liveChatPaidMessageRenderer"]["id"]
|
||||
print(test_id)
|
||||
assert test_id == "ChwKGkNKSGE0YnFJeWVBQ0ZWcUF3Z0VkdGIwRm9R"
|
||||
|
||||
|
||||
def test_arcparam_3(mocker):
|
||||
param = arcparam.getparam("01234567890")
|
||||
assert param == "op2w0wQsGiBDZzhhRFFvTE1ERXlNelExTmpjNE9UQWdBUSUzRCUzREABYARyAggBeAE%3D"
|
||||
Reference in New Issue
Block a user