Use httpx

taizan-hokuto
2020-08-30 22:16:58 +09:00
parent 8012e1d191
commit 95f975c93d
20 changed files with 259 additions and 354 deletions


@@ -1,6 +1,5 @@
-import aiohttp
+import httpx
 import asyncio
-import json
 from . import parser
 from . block import Block
 from . worker import ExtractWorker
@@ -55,7 +54,7 @@ def ready_blocks(video_id, duration, div, callback):
         raise ValueError

     async def _get_blocks(video_id, duration, div, callback):
-        async with aiohttp.ClientSession() as session:
+        async with httpx.AsyncClient(http2=True) as session:
             tasks = [_create_block(session, video_id, seektime, callback)
                      for seektime in _split(-1, duration, div)]
             return await asyncio.gather(*tasks)
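One caveat with this hunk: `http2=True` relies on httpx's optional HTTP/2 extra, so without the `h2` package installed (`pip install httpx[http2]`) constructing the client raises an ImportError. A minimal sketch of the swapped-in client, using a placeholder URL:

import asyncio
import httpx

async def main():
    # httpx.AsyncClient replaces aiohttp.ClientSession; both are async context managers.
    async with httpx.AsyncClient(http2=True) as client:
        resp = await client.get("https://www.youtube.com")
        print(resp.http_version)  # "HTTP/2" if the server negotiates it

asyncio.run(main())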
@@ -65,9 +64,8 @@ def ready_blocks(video_id, duration, div, callback):
         url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
         for _ in range(MAX_RETRY_COUNT):
             try:
-                async with session.get(url, headers=headers) as resp:
-                    text = await resp.text()
-                next_continuation, actions = parser.parse(json.loads(text))
+                resp = await session.get(url, headers=headers)
+                next_continuation, actions = parser.parse(resp.json())
                 break
             except JSONDecodeError:
                 await asyncio.sleep(3)
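The response API is the core of this change: aiohttp's `get` returns an async context manager whose `.text()` must be awaited, while httpx's `AsyncClient.get` is awaited directly and `.json()` is a plain synchronous method that raises `json.JSONDecodeError` on a malformed body, which is what the retry loop above catches. A self-contained sketch of that pattern, with the retry count and URL as stand-ins:

import asyncio
import httpx
from json import JSONDecodeError

async def fetch_json(url, headers, max_retry=3):
    async with httpx.AsyncClient() as session:
        for _ in range(max_retry):
            try:
                resp = await session.get(url, headers=headers)
                return resp.json()  # synchronous; raises JSONDecodeError on bad bodies
            except JSONDecodeError:
                await asyncio.sleep(3)
    raise RuntimeError("no valid JSON after retries")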
@@ -106,7 +104,7 @@ def fetch_patch(callback, blocks, video_id):
             )
             for block in blocks
         ]
-        async with aiohttp.ClientSession() as session:
+        async with httpx.AsyncClient() as session:
             tasks = [worker.run(session) for worker in workers]
             return await asyncio.gather(*tasks)
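Note the design here: a single `AsyncClient` is opened once and handed to every worker, so all requests share one connection pool instead of each worker opening its own session. A rough sketch of that fan-out, with `Worker` standing in for `ExtractWorker`:

import asyncio
import httpx

class Worker:
    def __init__(self, url):
        self.url = url

    async def run(self, session):
        # Every worker reuses the shared client's pooled connections.
        resp = await session.get(self.url)
        return resp.status_code

async def run_all(urls):
    async with httpx.AsyncClient() as session:
        tasks = [Worker(url).run(session) for url in urls]
        return await asyncio.gather(*tasks)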
@@ -114,9 +112,8 @@ def fetch_patch(callback, blocks, video_id):
         url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
         for _ in range(MAX_RETRY_COUNT):
             try:
-                async with session.get(url, headers=config.headers) as resp:
-                    chat_json = await resp.text()
-                continuation, actions = parser.parse(json.loads(chat_json))
+                resp = await session.get(url, headers=config.headers)
+                continuation, actions = parser.parse(resp.json())
                 break
             except JSONDecodeError:
                 await asyncio.sleep(3)


@@ -1,6 +1,7 @@
 from . block import Block
 from . patch import fill, split
 from ... paramgen import arcparam
+from typing import Tuple


 class ExtractWorker:
@@ -76,7 +77,7 @@ def _search_new_block(worker) -> Block:
     return new_block


-def _get_undone_block(blocks) -> (int, Block):
+def _get_undone_block(blocks) -> Tuple[int, Block]:
     min_interval_ms = 120000
     max_remaining = 0
     undone_block = None
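This annotation fix matters beyond style: `(int, Block)` is evaluated at definition time as an ordinary two-element tuple, which type checkers reject as a return annotation, whereas `typing.Tuple[int, Block]` is the valid spelling on the Python versions this project targets (before PEP 585 allowed `tuple[int, Block]`). A sketch of the signature shape only, with hypothetical selection logic:

from typing import Tuple

class Block:
    remaining = 0

def _get_undone_block(blocks) -> Tuple[int, Block]:
    # Hypothetical: pick the index and block with the most remaining work.
    return max(enumerate(blocks), key=lambda pair: pair[1].remaining)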


@@ -1,12 +1,12 @@
-import aiohttp
+import httpx
 import asyncio
 import json
 from . import parser
 from . block import Block
 from . worker import ExtractWorker
 from . patch import Patch
 from ... import config
 from ... import config
 from ... paramgen import arcparam_mining as arcparam
 from concurrent.futures import CancelledError
 from urllib.parse import quote
@@ -14,10 +14,12 @@ from urllib.parse import quote
 headers = config.headers
 REPLAY_URL = "https://www.youtube.com/live_chat_replay?continuation="
 INTERVAL = 1


-def _split(start, end, count, min_interval_sec = 120):
+def _split(start, end, count, min_interval_sec=120):
     """
     Split section from `start` to `end` into `count` pieces,
-    and returns the beginning of each piece.
+    and return the beginning of each piece.
     The `count` is adjusted so that the length of each piece
     is no smaller than `min_interval`.
@@ -25,42 +27,43 @@ def _split(start, end, count, min_interval_sec = 120):
     --------
     List of the offset of each block's first chat data.
     """
-    if not (isinstance(start,int) or isinstance(start,float)) or \
-        not (isinstance(end,int) or isinstance(end,float)):
+    if not (isinstance(start, int) or isinstance(start, float)) or \
+            not (isinstance(end, int) or isinstance(end, float)):
         raise ValueError("start/end must be int or float")
-    if not isinstance(count,int):
+    if not isinstance(count, int):
         raise ValueError("count must be int")
-    if start>end:
+    if start > end:
         raise ValueError("end must be equal to or greater than start.")
-    if count<1:
+    if count < 1:
         raise ValueError("count must be equal to or greater than 1.")
-    if (end-start)/count < min_interval_sec:
-        count = int((end-start)/min_interval_sec)
-        if count == 0 : count = 1
-    interval= (end-start)/count
+    if (end - start) / count < min_interval_sec:
+        count = int((end - start) / min_interval_sec)
+        if count == 0:
+            count = 1
+    interval = (end - start) / count
     if count == 1:
         return [start]
-    return sorted( list(set( [int(start + interval*j)
-        for j in range(count) ])))
+    return sorted(list(set([int(start + interval * j)
+                            for j in range(count)])))
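As a concrete check of the arithmetic above (a hypothetical call, not part of the commit): splitting a 600-second stream into 10 pieces would give 60-second blocks, below the 120-second floor, so `count` drops to int(600 / 120) = 5 and the offsets come out 120 seconds apart:

_split(0, 600, 10)
# (600 - 0) / 10 = 60 < 120  ->  count = int(600 / 120) = 5, interval = 120.0
# returns [0, 120, 240, 360, 480]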
 def ready_blocks(video_id, duration, div, callback):
-    if div <= 0: raise ValueError
+    if div <= 0:
+        raise ValueError

-    async def _get_blocks( video_id, duration, div, callback):
-        async with aiohttp.ClientSession() as session:
-            tasks = [_create_block(session, video_id, seektime, callback)
-                for seektime in _split(0, duration, div)]
+    async def _get_blocks(video_id, duration, div, callback):
+        async with httpx.AsyncClient() as session:
+            tasks = [_create_block(session, video_id, seektime, callback)
+                     for seektime in _split(0, duration, div)]
             return await asyncio.gather(*tasks)

     async def _create_block(session, video_id, seektime, callback):
-        continuation = arcparam.getparam(video_id, seektime = seektime)
-        url=(f"{REPLAY_URL}{quote(continuation)}&playerOffsetMs="
-            f"{int(seektime*1000)}&hidden=false&pbj=1")
-        async with session.get(url, headers = headers) as resp:
-            chat_json = await resp.text()
+        continuation = arcparam.getparam(video_id, seektime=seektime)
+        url = (f"{REPLAY_URL}{quote(continuation)}&playerOffsetMs="
+               f"{int(seektime*1000)}&hidden=false&pbj=1")
+        resp = await session.get(url, headers=headers)
+        chat_json = resp.text
         if chat_json is None:
             return
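Two httpx details in this hunk are easy to miss when porting from aiohttp: the response is not an async context manager (httpx has no `ClientSession`; the async client is `httpx.AsyncClient`), and `.text` is a property rather than a coroutine, so a leftover `await resp.text()` fails at runtime. Since httpx's `.text` is always a `str`, the `chat_json is None` guard above is effectively vestigial. A minimal sketch of the access pattern:

import asyncio
import httpx

async def demo():
    async with httpx.AsyncClient() as client:
        resp = await client.get("https://www.youtube.com")
        body = resp.text  # property access, not "await resp.text()"
        print(resp.status_code, len(body))

asyncio.run(demo())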
@@ -70,39 +73,40 @@ def ready_blocks(video_id, duration, div, callback):
         if callback:
             callback(actions, INTERVAL)
         return Block(
-            continuation = continuation,
-            chat_data = actions,
-            first = first,
-            last = seektime,
-            seektime = seektime
+            continuation=continuation,
+            chat_data=actions,
+            first=first,
+            last=seektime,
+            seektime=seektime
         )

     """
     fetch initial blocks.
     """
     loop = asyncio.get_event_loop()
     blocks = loop.run_until_complete(
         _get_blocks(video_id, duration, div, callback))
     return blocks


 def fetch_patch(callback, blocks, video_id):

     async def _allocate_workers():
         workers = [
             ExtractWorker(
-                fetch = _fetch, block = block,
-                blocks = blocks, video_id = video_id
+                fetch=_fetch, block=block,
+                blocks=blocks, video_id=video_id
             )
             for block in blocks
         ]
-        async with aiohttp.ClientSession() as session:
+        async with httpx.AsyncClient() as session:
             tasks = [worker.run(session) for worker in workers]
             return await asyncio.gather(*tasks)
-    async def _fetch(seektime,session) -> Patch:
-        continuation = arcparam.getparam(video_id, seektime = seektime)
-        url=(f"{REPLAY_URL}{quote(continuation)}&playerOffsetMs="
-            f"{int(seektime*1000)}&hidden=false&pbj=1")
-        async with session.get(url,headers = config.headers) as resp:
-            chat_json = await resp.text()
+    async def _fetch(seektime, session) -> Patch:
+        continuation = arcparam.getparam(video_id, seektime=seektime)
+        url = (f"{REPLAY_URL}{quote(continuation)}&playerOffsetMs="
+               f"{int(seektime*1000)}&hidden=false&pbj=1")
+        resp = await session.get(url, headers=config.headers)
+        chat_json = resp.text
         actions = []
         try:
@@ -113,21 +117,22 @@ def fetch_patch(callback, blocks, video_id):
             pass
         if callback:
             callback(actions, INTERVAL)
-        return Patch(chats = actions, continuation = continuation,
-            seektime = seektime, last = seektime)
+        return Patch(chats=actions, continuation=continuation,
+                     seektime=seektime, last=seektime)

     """
     allocate workers and assign blocks.
     """
     loop = asyncio.get_event_loop()
     try:
         loop.run_until_complete(_allocate_workers())
     except CancelledError:
         pass


 async def _shutdown():
     print("\nshutdown...")
     tasks = [t for t in asyncio.all_tasks()
-        if t is not asyncio.current_task()]
+             if t is not asyncio.current_task()]
     for task in tasks:
         task.cancel()
         try:

@@ -135,7 +140,7 @@ async def _shutdown():
         except asyncio.CancelledError:
             pass


 def cancel():
     loop = asyncio.get_event_loop()
     loop.create_task(_shutdown())


@@ -1,6 +1,6 @@
 import json
 import re
-import requests
+import httpx
 from .. import config
 from ..exceptions import InvalidVideoIdException
 from ..util.extract_video_id import extract_video_id
@@ -85,7 +85,7 @@ class VideoInfo:
     def _get_page_text(self, video_id):
         url = f"https://www.youtube.com/embed/{video_id}"
-        resp = requests.get(url, headers=headers)
+        resp = httpx.get(url, headers=headers)
         resp.raise_for_status()
         return resp.text
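On the synchronous side the swap is nearly mechanical: top-level `httpx.get` mirrors `requests.get`, and `raise_for_status()` and `.text` carry over unchanged, though the exception raised is `httpx.HTTPStatusError` rather than requests' type. One version caveat: newer httpx releases do not follow redirects by default, so a sketch that matches requests' behavior would pass `follow_redirects=True` (the keyword in current releases; early releases used `allow_redirects`):

import httpx

def get_page_text(video_id):
    url = f"https://www.youtube.com/embed/{video_id}"
    resp = httpx.get(url, headers={"user-agent": "Mozilla/5.0"}, follow_redirects=True)
    resp.raise_for_status()  # httpx.HTTPStatusError on 4xx/5xx
    return resp.text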