Divide modules

This commit is contained in:
taizan-hokuto
2020-02-01 21:08:27 +09:00
parent 228773295d
commit 04aedc82e8
4 changed files with 171 additions and 150 deletions

28
pytchat/tool/dlworker.py Normal file
View File

@@ -0,0 +1,28 @@
from . import parser
class DownloadWorker:
    """Fetches archived chat data for a single block.

    Each worker repeatedly calls the supplied download coroutine and trims
    the fetched lines at the block's temporary last offset so that adjacent
    workers do not collect overlapping data.

    Parameters:
        dl        -- awaitable fetch function:
                     (continuation, session) -> (actions, continuation, last_offset)
        block     -- the block object this worker fills; reads/writes its
                     chat_data, continuation, last and temp_last attributes
        blocklist -- the full list of blocks (held for coordination;
                     not read by the code visible here)
    """

    def __init__(self, dl, block, blocklist):
        self.block = block
        self.blocklist = blocklist
        self.dl = dl

    async def run(self, session):
        """Download chat pages until the continuation runs out or the block
        reaches its temporary last offset."""
        temp_last = self.block.temp_last
        # Trim any data already attached to the block before fetching more.
        self.block.chat_data, continuation = self.cut(
            self.block.chat_data,
            self.block.continuation,
            self.block.last,
            temp_last)
        while continuation:
            data, cont, fetched_last = await self.dl(continuation, session)
            data, continuation = self.cut(data, cont, fetched_last, temp_last)
            self.block.chat_data.extend(data)

    def cut(self, data, cont, fetched_last, temp_last):
        """Truncate `data` at `temp_last`.

        Returns (kept_data, continuation); the continuation is None when the
        block has reached its boundary and fetching must stop.
        """
        # temp_last == -1 means "no boundary" (checked first so a sentinel
        # fetched_last never has to be compared against it).
        if temp_last == -1 or fetched_last < temp_last:
            return data, cont
        for i, line in enumerate(data):
            line_offset = parser.get_offset(line)
            if line_offset >= temp_last:
                self.block.last = line_offset
                return data[:i], None
        # Fix: the original fell off the end here (implicitly returning
        # None), which callers fail to unpack.  Reachable when `data` is
        # empty; stop fetching at the boundary.
        return data, None

View File

@@ -3,13 +3,15 @@ import aiohttp
import json import json
import traceback import traceback
from urllib.parse import quote from urllib.parse import quote
from . import parser from . import parser
from . import videoinfo from . import videoinfo
from . dlworker import DownloadWorker
from . duplcheck import duplicate_head, duplicate_tail, overwrap
from .. import config from .. import config
from .. import util from .. import util
from .. paramgen import arcparam from .. paramgen import arcparam
from ..exceptions import InvalidVideoIdException from ..exceptions import InvalidVideoIdException
logger = config.logger(__name__) logger = config.logger(__name__)
headers=config.headers headers=config.headers
@@ -91,27 +93,7 @@ class Downloader:
return self return self
def remove_duplicate_head(self): def remove_duplicate_head(self):
blocks = self.blocks self.blocks = duplicate_head(self.blocks)
def is_same_offset(index):
return (blocks[index].first == blocks[index+1].first)
def is_same_id(index):
id_0 = parser.get_id(blocks[index].chat_data[0])
id_1 = parser.get_id(blocks[index+1].chat_data[0])
return (id_0 == id_1)
def is_same_type(index):
type_0 = parser.get_type(blocks[index].chat_data[0])
type_1 = parser.get_type(blocks[index+1].chat_data[0])
return (type_0 == type_1)
ret = []
[ret.append(blocks[i]) for i in range(len(blocks)-1)
if (len(blocks[i].chat_data)>0 and
not ( is_same_offset(i) and is_same_id(i) and is_same_type(i)))]
ret.append(blocks[-1])
self.blocks = ret
return self return self
def set_temporary_last(self): def set_temporary_last(self):
@@ -121,118 +103,49 @@ class Downloader:
return self return self
def remove_overwrap(self): def remove_overwrap(self):
blocks = self.blocks self.blocks = overwrap(self.blocks)
if len(blocks) == 1 : return self
ret = []
a = 0
b = 1
jmp = False
ret.append(blocks[0])
while a < len(blocks)-2:
while blocks[a].last > blocks[b].first:
b+=1
if b == len(blocks)-1:
jmp = True
break
if jmp: break
if b-a == 1:
a = b
else:
a = b-1
ret.append(blocks[a])
b = a+1
ret.append(blocks[-1])
self.blocks = ret
return self return self
def download_blocks(self): def download_blocks(self):
loop = asyncio.get_event_loop() loop = asyncio.get_event_loop()
loop.run_until_complete(self._dl_allocate()) loop.run_until_complete(self._dl_distribute())
return self return self
async def _dl_allocate(self): async def _dl_distribute(self):
tasks = [] workers = [
DownloadWorker(
dl=self.dl_func,
block = block,
blocklist= self.blocks
)
for pos,block in enumerate(self.blocks)
]
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
tasks = [self._dl_task(session, block) for block in self.blocks] tasks = [worker.run(session) for worker in workers]
return await asyncio.gather(*tasks,return_exceptions=True) return await asyncio.gather(*tasks,return_exceptions=True)
async def _dl_task(self, session, block:Block): async def dl_func(self,continuation,session):
if (block.temp_last != -1 and url = f"{REPLAY_URL}{quote(continuation)}&pbj=1"
block.last > block.temp_last): async with session.get(url,headers = config.headers) as resp:
return text = await resp.text()
continuation = block.continuation continuation, actions = parser.parse(json.loads(text))
while continuation: if actions:
url = f"{REPLAY_URL}{quote(continuation)}&pbj=1" last = parser.get_offset(actions[-1])
async with session.get(url,headers = config.headers) as resp: first = parser.get_offset(actions[0])
text = await resp.text() if self.callback:
continuation, actions = parser.parse(json.loads(text)) self.callback(actions,last-first)
if actions: return actions,continuation,last
block.chat_data.extend(actions) return continuation, [], None
last = parser.get_offset(actions[-1])
first = parser.get_offset(actions[0])
if self.callback:
self.callback(actions,last-first)
if block.temp_last != -1:
if last > block.temp_last:
block.last = last
break
else:
block.last = last
def remove_duplicate_tail(self): def remove_duplicate_tail(self):
blocks = self.blocks self.blocks = duplicate_tail(self.blocks)
if len(blocks) == 1 : return self
def is_same_offset(index):
return (blocks[index-1].last == blocks[index].last)
def is_same_id(index):
id_0 = parser.get_id(blocks[index-1].chat_data[-1])
id_1 = parser.get_id(blocks[index].chat_data[-1])
return (id_0 == id_1)
def is_same_type(index):
type_0 = parser.get_type(blocks[index-1].chat_data[-1])
type_1 = parser.get_type(blocks[index].chat_data[-1])
return (type_0 == type_1)
ret = []
ret.append(blocks[0])
[ret.append(blocks[i]) for i in range(1,len(blocks)-1)
if not ( is_same_offset(i) and is_same_id(i) and is_same_type(i) )]
ret.append(self.blocks[-1])
self.blocks = ret
return self return self
def combine(self): def combine(self):
line = '' ret = []
try: for block in self.blocks:
if len(self.blocks[0].chat_data)>0: ret.extend(block.chat_data)
lastline=self.blocks[0].chat_data[-1] return ret
lastline_offset = parser.get_offset(lastline)
else: return None
for i in range(1,len(self.blocks)):
f=self.blocks[i].chat_data
if len(f)==0:
logger.error(f'zero size piece.:{str(i)}')
continue
for row in range(len(f)):
line = f[row]
if parser.get_offset(line) > lastline_offset:
self.blocks[0].chat_data.extend(f[row:])
break
else:
logger.error(
f'Missing connection.: pos:{str(i-1)}->{str(i)}'
f' lastline_offset= {lastline_offset}')
lastline_offset = parser.get_offset( f[-1])
return self.blocks[0].chat_data
except Exception as e:
logger.error(f"{type(e)} {str(e)} {line}")
traceback.print_exc()
def download(self): def download(self):
return ( return (
@@ -245,28 +158,6 @@ class Downloader:
.combine() .combine()
) )
def check_duplicate(blocks):
def is_same_offset(index):
offset_0 = parser.get_offset(blocks[index])
offset_1 = parser.get_offset(blocks[index+1])
return (offset_0 == offset_1)
def is_same_id(index):
id_0 = parser.get_id(blocks[index])
id_1 = parser.get_id(blocks[index+1])
return (id_0 == id_1)
def is_same_type(index):
type_0 = parser.get_type(blocks[index])
type_1 = parser.get_type(blocks[index+1])
return (type_0 == type_1)
ret =[]
for i in range(len(blocks)-1):
if ( is_same_offset(i) and is_same_id(i) and is_same_type(i) ):
ret.append(blocks[i])
return ret
def download(video_id, div = 20, callback=None, processor = None): def download(video_id, div = 20, callback=None, processor = None):
duration = 0 duration = 0

105
pytchat/tool/duplcheck.py Normal file
View File

@@ -0,0 +1,105 @@
from . import parser
def check_duplicate(chatdata):
    """Scan `chatdata` for pairs of lines sharing offset, id and type.

    Returns a list of {"i": {...}, "j": {...}} records, one per duplicated
    pair, each side carrying the index, id and offsetTime of that line.
    """
    # NOTE(review): the scan covers indices 0..len-2; the final chat line
    # is excluded, mirroring the original table size — confirm intended.
    last = len(chatdata) - 1
    offsets = [None] * last
    ids = [None] * last
    types = [None] * last

    print("creating table...")
    # Precompute the three comparison keys once per line.
    for idx in range(last):
        item = chatdata[idx]
        offsets[idx] = parser.get_offset(item)
        ids[idx] = parser.get_id(item)
        types[idx] = parser.get_type(item)

    def entry(k):
        # Describe one member of a duplicated pair.
        return {
            "index": k,
            "id": parser.get_id(chatdata[k]),
            "offsetTime": parser.get_offset(chatdata[k]),
        }

    print("searching duplicate data...")
    result = []
    for i in range(last):
        for j in range(i + 1, last):
            if (offsets[i] == offsets[j]
                    and ids[i] == ids[j]
                    and types[i] == types[j]):
                result.append({"i": entry(i), "j": entry(j)})
    return result
def duplicate_head(blocks):
    """Drop blocks whose first chat line duplicates the first line of the
    following block (same starting offset, id and type).

    Blocks with empty chat_data are also dropped, except the final block,
    which is always kept.
    """
    if len(blocks) == 1:
        return blocks

    def is_duplicate_head(index):
        # Fix: an empty next block can never be a duplicate; the original
        # indexed its chat_data[0] unconditionally and raised IndexError.
        if not blocks[index + 1].chat_data:
            return False
        head_0 = blocks[index].chat_data[0]
        head_1 = blocks[index + 1].chat_data[0]
        return (
            blocks[index].first == blocks[index + 1].first
            and parser.get_id(head_0) == parser.get_id(head_1)
            and parser.get_type(head_0) == parser.get_type(head_1)
        )

    ret = [blocks[i] for i in range(len(blocks) - 1)
           if (len(blocks[i].chat_data) > 0 and
               not is_duplicate_head(i))]
    ret.append(blocks[-1])
    return ret
def duplicate_tail(blocks):
    """Drop blocks whose last chat line duplicates the last line of the
    preceding block (same ending offset, id and type).

    The first and last blocks are always kept.
    """
    if len(blocks) == 1:
        return blocks

    def is_duplicate_tail(index):
        # Fix: guard empty chat_data; the original indexed chat_data[-1]
        # unconditionally and raised IndexError on an empty block.
        if not blocks[index - 1].chat_data or not blocks[index].chat_data:
            return False
        tail_0 = blocks[index - 1].chat_data[-1]
        tail_1 = blocks[index].chat_data[-1]
        return (
            blocks[index - 1].last == blocks[index].last
            and parser.get_id(tail_0) == parser.get_id(tail_1)
            and parser.get_type(tail_0) == parser.get_type(tail_1)
        )

    ret = [blocks[i] for i in range(0, len(blocks) - 1)
           if i == 0 or not is_duplicate_tail(i)]
    ret.append(blocks[-1])
    return ret
def overwrap(blocks):
    """Remove blocks whose start is already covered by an earlier block.

    A block is redundant when a previous block's `last` offset extends past
    its `first` offset; the first and last blocks are always kept.
    """
    if len(blocks) == 1:
        return blocks
    kept = [blocks[0]]
    left = 0
    right = 1
    reached_end = False
    while left < len(blocks) - 2:
        # Advance `right` past every block whose start is covered by
        # blocks[left]'s range.
        while blocks[left].last > blocks[right].first:
            right += 1
            if right == len(blocks) - 1:
                reached_end = True
                break
        if reached_end:
            break
        # Keep the last block that still connects to the covered range.
        left = right if right - left == 1 else right - 1
        kept.append(blocks[left])
        right = left + 1
    kept.append(blocks[-1])
    return kept

View File

@@ -5,7 +5,6 @@ from .. exceptions import (
NoContentsException, NoContentsException,
NoContinuationsException ) NoContinuationsException )
logger = config.logger(__name__) logger = config.logger(__name__)
def parse(jsn): def parse(jsn):
@@ -27,7 +26,6 @@ def parse(jsn):
raise ResponseContextError('動画に接続できません。' raise ResponseContextError('動画に接続できません。'
'動画IDが間違っているか、動画が削除非公開の可能性があります。') '動画IDが間違っているか、動画が削除非公開の可能性があります。')
contents=jsn['response'].get('continuationContents') contents=jsn['response'].get('continuationContents')
#配信が終了した場合、もしくはチャットデータが取得できない場合
if contents is None: if contents is None:
raise NoContentsException('チャットデータを取得できませんでした。') raise NoContentsException('チャットデータを取得できませんでした。')
@@ -41,17 +39,16 @@ def parse(jsn):
return continuation, actions return continuation, actions
return None, [] return None, []
# if actions is None:
# return {"continuation":None,"chatdata":[]}
def get_offset(item): def get_offset(item):
return int(item['replayChatItemAction']["videoOffsetTimeMsec"]) return int(item['replayChatItemAction']["videoOffsetTimeMsec"])
def get_id(item): def get_id(item):
return list((list(item['replayChatItemAction']["actions"][0].values())[0])['item'].values())[0].get('id') return list((list(item['replayChatItemAction']["actions"][0].values()
)[0])['item'].values())[0].get('id')
def get_type(item): def get_type(item):
return list((list(item['replayChatItemAction']["actions"][0].values())[0])['item'].keys())[0] return list((list(item['replayChatItemAction']["actions"][0].values()
)[0])['item'].keys())[0]