diff --git a/README.md b/README.md index 50a26fe..9c2267b 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ pytchat is a python library for fetching youtube live chat. pytchat is a python library for fetching youtube live chat without using youtube api, Selenium or BeautifulSoup. -pytchatはAPIを使わずにYouTubeチャットを取得するためのpythonライブラリです。 +pytchatは、YouTubeチャットを閲覧するためのpythonライブラリです。 Other features: + Customizable [chat data processors](https://github.com/taizan-hokuto/pytchat/wiki/ChatProcessor) including youtube api compatible one. @@ -30,10 +30,9 @@ One-liner command. Save chat data to html, with embedded custom emojis. ```bash -$ pytchat -v ZJ6Q4U_Vg6s -o "c:/temp/" - +$ pytchat -v https://www.youtube.com/watch?v=ZJ6Q4U_Vg6s -o "c:/temp/" # options: -# -v : video_id +# -v : Video ID or URL that includes ID # -o : output directory (default path: './') # saved filename is [video_id].html ``` @@ -43,7 +42,8 @@ $ pytchat -v ZJ6Q4U_Vg6s -o "c:/temp/" ```python from pytchat import LiveChat livechat = LiveChat(video_id = "Zvp1pJpie4I") - +# It is also possible to specify a URL that includes the video ID: +# livechat = LiveChat("https://www.youtube.com/watch?v=Zvp1pJpie4I") while livechat.is_alive(): try: chatdata = livechat.get() diff --git a/pytchat/cli/__init__.py b/pytchat/cli/__init__.py index 696af45..62237ff 100644 --- a/pytchat/cli/__init__.py +++ b/pytchat/cli/__init__.py @@ -1,5 +1,6 @@ import argparse from pathlib import Path +from pytchat.util.extract_video_id import extract_video_id from .arguments import Arguments from .. exceptions import InvalidVideoIdException, NoContents from .. 
processors.html_archiver import HTMLArchiver @@ -19,16 +20,19 @@ https://github.com/PetterKraabol/Twitch-Chat-Downloader def main(): # Arguments parser = argparse.ArgumentParser(description=f'pytchat v{__version__}') - parser.add_argument('-v', f'--{Arguments.Name.VIDEO}', type=str, - help='Video IDs separated by commas without space.\n' + # parser.add_argument('VideoID_or_URL', type=str, default='__NONE__',nargs='?', + # help='Video ID, or URL that includes id.\n' + # 'If ID starts with a hyphen (-), enclose the ID in square brackets.') + parser.add_argument('-v', f'--{Arguments.Name.VIDEO_IDS}', type=str, + help='Video ID (or URL that includes Video ID). You can specify multiple video IDs by separating them with commas without spaces.\n' 'If ID starts with a hyphen (-), enclose the ID in square brackets.') parser.add_argument('-o', f'--{Arguments.Name.OUTPUT}', type=str, help='Output directory (end with "/"). default="./"', default='./') parser.add_argument(f'--{Arguments.Name.VERSION}', action='store_true', - help='Settings version') + help='Show version') Arguments(parser.parse_args().__dict__) if Arguments().print_version: - print(f'pytchat v{__version__}') + print(f'pytchat v{__version__} © 2019 taizan-hokuto') return # Extractor @@ -43,14 +47,16 @@ def main(): f" channel: {info.get_channel_name()}\n" f" title: {info.get_title()}") path = Path(Arguments().output + video_id + '.html') - print(f"output path: {path.resolve()}") + print(f" output path: {path.resolve()}") Extractor(video_id, processor=HTMLArchiver( Arguments().output + video_id + '.html'), callback=_disp_progress ).extract() print("\nExtraction end.\n") - except (InvalidVideoIdException, NoContents) as e: + except InvalidVideoIdException: + print("Invalid Video ID or URL:", video_id) + except (TypeError, NoContents) as e: print(e) return parser.print_help() diff --git a/pytchat/cli/arguments.py b/pytchat/cli/arguments.py index d6fea2b..6f62548 100644 --- a/pytchat/cli/arguments.py +++ 
b/pytchat/cli/arguments.py @@ -16,8 +16,8 @@ class Arguments(metaclass=Singleton): class Name: VERSION: str = 'version' - OUTPUT: str = 'output' - VIDEO: str = 'video' + OUTPUT: str = 'output_dir' + VIDEO_IDS: str = 'video_id' def __init__(self, arguments: Optional[Dict[str, Union[str, bool, int]]] = None): @@ -35,6 +35,9 @@ class Arguments(metaclass=Singleton): self.output: str = arguments[Arguments.Name.OUTPUT] self.video_ids: List[int] = [] # Videos - if arguments[Arguments.Name.VIDEO]: + if arguments[Arguments.Name.VIDEO_IDS]: self.video_ids = [video_id - for video_id in arguments[Arguments.Name.VIDEO].split(',')] + for video_id in arguments[Arguments.Name.VIDEO_IDS].split(',')] + + + diff --git a/pytchat/core_async/livechat.py b/pytchat/core_async/livechat.py index 2cc3ff8..17f91f2 100644 --- a/pytchat/core_async/livechat.py +++ b/pytchat/core_async/livechat.py @@ -15,6 +15,7 @@ from .. import exceptions from ..paramgen import liveparam, arcparam from ..processors.default.processor import DefaultProcessor from ..processors.combinator import Combinator +from ..util.extract_video_id import extract_video_id headers = config.headers MAX_RETRY = 10 @@ -86,7 +87,7 @@ class LiveChatAsync: topchat_only=False, logger=config.logger(__name__), ): - self._video_id = video_id + self._video_id = extract_video_id(video_id) self.seektime = seektime if isinstance(processor, tuple): self.processor = Combinator(processor) diff --git a/pytchat/core_multithread/livechat.py b/pytchat/core_multithread/livechat.py index 7f99c55..f439026 100644 --- a/pytchat/core_multithread/livechat.py +++ b/pytchat/core_multithread/livechat.py @@ -14,6 +14,7 @@ from .. 
import exceptions from ..paramgen import liveparam, arcparam from ..processors.default.processor import DefaultProcessor from ..processors.combinator import Combinator +from ..util.extract_video_id import extract_video_id headers = config.headers MAX_RETRY = 10 @@ -84,7 +85,7 @@ class LiveChat: topchat_only=False, logger=config.logger(__name__) ): - self._video_id = video_id + self._video_id = extract_video_id(video_id) self.seektime = seektime if isinstance(processor, tuple): self.processor = Combinator(processor) diff --git a/pytchat/processors/html_archiver.py b/pytchat/processors/html_archiver.py index dedab39..3676770 100644 --- a/pytchat/processors/html_archiver.py +++ b/pytchat/processors/html_archiver.py @@ -47,7 +47,7 @@ class HTMLArchiver(ChatProcessor): super().__init__() self.save_path = self._checkpath(save_path) self.processor = DefaultProcessor() - self.emoji_table = {} # table for custom emojis. key: emoji_id, value: base64 encoded image binary. + self.emoji_table = {} # table for custom emojis. key: emoji_id, value: base64 encoded image binary. self.header = [HEADER_HTML] self.body = ['\n', '\n', self._parse_table_header(fmt_headers)] diff --git a/pytchat/tool/extract/extractor.py b/pytchat/tool/extract/extractor.py index 2b421af..56bd8aa 100644 --- a/pytchat/tool/extract/extractor.py +++ b/pytchat/tool/extract/extractor.py @@ -3,6 +3,7 @@ from . import duplcheck from .. videoinfo import VideoInfo from ... import config from ... exceptions import InvalidVideoIdException +from ...
util.extract_video_id import extract_video_id logger = config.logger(__name__) headers = config.headers @@ -14,7 +15,7 @@ class Extractor: raise ValueError('div must be positive integer.') elif div > 10: div = 10 - self.video_id = video_id + self.video_id = extract_video_id(video_id) self.div = div self.callback = callback self.processor = processor diff --git a/pytchat/tool/videoinfo.py b/pytchat/tool/videoinfo.py index 13712dc..a87e0c4 100644 --- a/pytchat/tool/videoinfo.py +++ b/pytchat/tool/videoinfo.py @@ -3,6 +3,7 @@ import re import requests from .. import config from ..exceptions import InvalidVideoIdException +from ..util.extract_video_id import extract_video_id headers = config.headers @@ -78,8 +79,8 @@ class VideoInfo: ''' def __init__(self, video_id): - self.video_id = video_id - text = self._get_page_text(video_id) + self.video_id = extract_video_id(video_id) + text = self._get_page_text(self.video_id) self._parse(text) def _get_page_text(self, video_id): diff --git a/pytchat/util/extract_video_id.py b/pytchat/util/extract_video_id.py new file mode 100644 index 0000000..75385f8 --- /dev/null +++ b/pytchat/util/extract_video_id.py @@ -0,0 +1,25 @@ +import re +from .. 
exceptions import InvalidVideoIdException + + +PATTERN = re.compile(r"((?<=(v|V)/)|(?<=be/)|(?<=(\?|\&)v=)|(?<=embed/))([\w-]+)") +YT_VIDEO_ID_LENGTH = 11 + + +def extract_video_id(url_or_id: str) -> str: + ret = '' + if type(url_or_id) != str: + raise TypeError(f"{url_or_id}: URL or VideoID must be str, but {type(url_or_id)} is passed.") + if len(url_or_id) == YT_VIDEO_ID_LENGTH: + return url_or_id + match = re.search(PATTERN, url_or_id) + if match is None: + raise InvalidVideoIdException(url_or_id) + try: + ret = match.group(4) + except IndexError: + raise InvalidVideoIdException(url_or_id) + + if ret is None or len(ret) != YT_VIDEO_ID_LENGTH: + raise InvalidVideoIdException(url_or_id) + return ret diff --git a/tests/test_extract_video_id.py b/tests/test_extract_video_id.py new file mode 100644 index 0000000..7d97851 --- /dev/null +++ b/tests/test_extract_video_id.py @@ -0,0 +1,55 @@ +from pytchat.util.extract_video_id import extract_video_id +from pytchat.exceptions import InvalidVideoIdException + +VALID_TEST_PATTERNS = ( + ("ABC_EFG_IJK", "ABC_EFG_IJK"), + ("vid_test_be", "vid_test_be"), + ("https://www.youtube.com/watch?v=123_456_789", "123_456_789"), + ("https://www.youtube.com/watch?v=123_456_789&t=123s", "123_456_789"), + ("www.youtube.com/watch?v=123_456_789", "123_456_789"), + ("watch?v=123_456_789", "123_456_789"), + ("youtube.com/watch?v=123_456_789", "123_456_789"), + ("http://youtu.be/ABC_EFG_IJK", "ABC_EFG_IJK"), + ("youtu.be/ABC_EFG_IJK", "ABC_EFG_IJK"), + ("https://www.youtube.com/watch?v=ABC_EFG_IJK&list=XYZ_ABC_12345&start_radio=1&t=1", "ABC_EFG_IJK"), + ("https://www.youtube.com/embed/ABC_EFG_IJK", "ABC_EFG_IJK"), + ("www.youtube.com/embed/ABC_EFG_IJK", "ABC_EFG_IJK"), + ("youtube.com/embed/ABC_EFG_IJK", "ABC_EFG_IJK") +) + +INVALID_TEST_PATTERNS = ( + ("", ""), + ("0123456789", "0123456789"), # less than 11 letters id + ("more_than_11_letter_string", "more_than_11_letter_string"), + 
("https://www.youtube.com/watch?v=more_than_11_letter_string", "more_than_11_letter_string"), + ("https://www.youtube.com/channel/123_456_789", "123_456_789"), +) + +TYPEERROR_TEST_PATTERNS = ( + (100, 100), # not string + (["123_456_789"], "123_456_789"), # not string +) + + +def test_extract_valid_pattern(): + for pattern in VALID_TEST_PATTERNS: + ret = extract_video_id(pattern[0]) + assert ret == pattern[1] + + +def test_extract_invalid_pattern(): + for pattern in INVALID_TEST_PATTERNS: + try: + extract_video_id(pattern[0]) + assert False + except InvalidVideoIdException: + assert True + + +def test_extract_typeerror_pattern(): + for pattern in TYPEERROR_TEST_PATTERNS: + try: + extract_video_id(pattern[0]) + assert False + except TypeError: + assert True diff --git a/tests/test_livechat.py b/tests/test_livechat.py index 6c0d38f..31c7677 100644 --- a/tests/test_livechat.py +++ b/tests/test_livechat.py @@ -11,13 +11,13 @@ def _open_file(path): @aioresponses() def test_Async(*mock): - vid = '' + vid = '__test_id__' _text = _open_file('tests/testdata/paramgen_firstread.json') _text = json.loads(_text) mock[0].get( f"https://www.youtube.com/live_chat?v={vid}&is_popout=1", status=200, body=_text) try: - chat = LiveChatAsync(video_id='') + chat = LiveChatAsync(video_id='__test_id__') assert chat.is_alive() chat.terminate() assert not chat.is_alive() @@ -33,7 +33,7 @@ def test_MultiThread(mocker): responseMock.text = _text mocker.patch('requests.Session.get').return_value = responseMock try: - chat = LiveChatAsync(video_id='') + chat = LiveChatAsync(video_id='__test_id__') assert chat.is_alive() chat.terminate() assert not chat.is_alive() diff --git a/tests/test_livechat_2.py b/tests/test_livechat_2.py index 0fbe42a..42e42c2 100644 --- a/tests/test_livechat_2.py +++ b/tests/test_livechat_2.py @@ -20,7 +20,7 @@ def test_async_live_stream(*mock): r'^https://www.youtube.com/live_chat/get_live_chat\?continuation=.*$') _text = _open_file('tests/testdata/test_stream.json') 
mock[0].get(pattern, status=200, body=_text) - chat = LiveChatAsync(video_id='', processor=DummyProcessor()) + chat = LiveChatAsync(video_id='__test_id__', processor=DummyProcessor()) chats = await chat.get() rawdata = chats[0]["chatdata"] # assert fetching livachat data @@ -60,7 +60,7 @@ def test_async_replay_stream(*mock): mock[0].get(pattern_live, status=200, body=_text_live) mock[0].get(pattern_replay, status=200, body=_text_replay) - chat = LiveChatAsync(video_id='', processor=DummyProcessor()) + chat = LiveChatAsync(video_id='__test_id__', processor=DummyProcessor()) chats = await chat.get() rawdata = chats[0]["chatdata"] # assert fetching replaychat data @@ -93,7 +93,7 @@ def test_async_force_replay(*mock): mock[0].get(pattern_replay, status=200, body=_text_replay) # force replay chat = LiveChatAsync( - video_id='', processor=DummyProcessor(), force_replay=True) + video_id='__test_id__', processor=DummyProcessor(), force_replay=True) chats = await chat.get() rawdata = chats[0]["chatdata"] # assert fetching replaychat data @@ -119,7 +119,7 @@ def test_multithread_live_stream(mocker): mocker.patch( 'requests.Session.get').return_value.__enter__.return_value = responseMock - chat = LiveChat(video_id='test_id', processor=DummyProcessor()) + chat = LiveChat(video_id='__test_id__', processor=DummyProcessor()) chats = chat.get() rawdata = chats[0]["chatdata"] # assert fetching livachat data diff --git a/tests/test_videoinfo.py b/tests/test_videoinfo.py index 786977b..8a33075 100644 --- a/tests/test_videoinfo.py +++ b/tests/test_videoinfo.py @@ -1,11 +1,12 @@ from pytchat.tool.videoinfo import VideoInfo from pytchat.exceptions import InvalidVideoIdException -import pytest + def _open_file(path): - with open(path,mode ='r',encoding = 'utf-8') as f: + with open(path, mode='r', encoding='utf-8') as f: return f.read() + def _set_test_data(filepath, mocker): _text = _open_file(filepath) response_mock = mocker.Mock() @@ -13,23 +14,25 @@ def _set_test_data(filepath, 
mocker): response_mock.text = _text mocker.patch('requests.get').return_value = response_mock + def test_archived_page(mocker): _set_test_data('tests/testdata/videoinfo/archived_page.txt', mocker) - info = VideoInfo('test_id') + info = VideoInfo('__test_id__') actual_thumbnail_url = 'https://i.ytimg.com/vi/fzI9FNjXQ0o/hqdefault.jpg' - assert info.video_id == 'test_id' + assert info.video_id == '__test_id__' assert info.get_channel_name() == 'GitHub' assert info.get_thumbnail() == actual_thumbnail_url assert info.get_title() == 'GitHub Arctic Code Vault' assert info.get_channel_id() == 'UC7c3Kb6jYCRj4JOHHZTxKsQ' assert info.get_duration() == 148 + def test_live_page(mocker): _set_test_data('tests/testdata/videoinfo/live_page.txt', mocker) - info = VideoInfo('test_id') + info = VideoInfo('__test_id__') '''live page :duration = 0''' assert info.get_duration() == 0 - assert info.video_id == 'test_id' + assert info.video_id == '__test_id__' assert info.get_channel_name() == 'BGM channel' assert info.get_thumbnail() == \ 'https://i.ytimg.com/vi/fEvM-OUbaKs/hqdefault_live.jpg' @@ -38,25 +41,26 @@ def test_live_page(mocker): ' - 24/7 Live Stream - Slow Jazz') assert info.get_channel_id() == 'UCQINXHZqCU5i06HzxRkujfg' + def test_invalid_video_id(mocker): '''Test case invalid video_id is specified.''' _set_test_data( 'tests/testdata/videoinfo/invalid_video_id_page.txt', mocker) try: - _ = VideoInfo('test_id') + _ = VideoInfo('__test_id__') assert False except InvalidVideoIdException: assert True + def test_no_info(mocker): '''Test case the video page has renderer, but no info.''' _set_test_data( 'tests/testdata/videoinfo/no_info_page.txt', mocker) - info = VideoInfo('test_id') - assert info.video_id == 'test_id' + info = VideoInfo('__test_id__') + assert info.video_id == '__test_id__' assert info.get_channel_name() is None assert info.get_thumbnail() is None assert info.get_title() is None assert info.get_channel_id() is None assert info.get_duration() is None -