Merge branch 'feature/emoji_embedding' into develop

This commit is contained in:
taizan-hokuto
2020-07-13 01:45:07 +09:00
8 changed files with 119 additions and 51 deletions

View File

@@ -27,7 +27,7 @@ pip install pytchat
### CLI ### CLI
One-liner command. One-liner command.
Save chat data to html. Save chat data to html, with embedded custom emojis.
```bash ```bash
$ pytchat -v ZJ6Q4U_Vg6s -o "c:/temp/" $ pytchat -v ZJ6Q4U_Vg6s -o "c:/temp/"
@@ -148,6 +148,20 @@ def main():
if __name__ == '__main__': if __name__ == '__main__':
main() main()
``` ```
### Extract archived chat data as [HTML](https://github.com/taizan-hokuto/pytchat/wiki/HTMLArchiver) or [tab separated values](https://github.com/taizan-hokuto/pytchat/wiki/TSVArchiver).
```python
from pytchat import HTMLArchiver, Extractor
video_id = "*******"
ex = Extractor(
video_id,
div=10,
processor=HTMLArchiver("c:/test.html")
)
ex.extract()
print("finished.")
```
## Structure of Default Processor ## Structure of Default Processor
Each item can be got with `items` function. Each item can be got with `items` function.
@@ -175,7 +189,7 @@ Each item can be got with `items` function.
<tr> <tr>
<td>messageEx</td> <td>messageEx</td>
<td>str</td> <td>str</td>
<td>list of message texts and emoji URLs.</td> <td>list of message texts and emoji dicts(id, txt, url).</td>
</tr> </tr>
<tr> <tr>
<td>timestamp</td> <td>timestamp</td>

View File

@@ -1,9 +1,9 @@
import logging
from . import mylogger from . import mylogger
headers = { headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'} 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'}
def logger(module_name: str, loglevel=None): def logger(module_name: str, loglevel=logging.DEBUG):
module_logger = mylogger.get_logger(module_name, loglevel=loglevel) module_logger = mylogger.get_logger(module_name, loglevel=loglevel)
return module_logger return module_logger

View File

@@ -325,7 +325,8 @@ class LiveChatAsync:
self._pauser.put_nowait(None) self._pauser.put_nowait(None)
self._is_alive = False self._is_alive = False
self._buffer.put_nowait({}) self._buffer.put_nowait({})
self.processor.finalize()
def _task_finished(self): def _task_finished(self):
''' '''
Listenerを終了する。 Listenerを終了する。

View File

@@ -316,6 +316,7 @@ class LiveChat:
self._is_alive = False self._is_alive = False
self._buffer.put({}) self._buffer.put({})
self._event.set() self._event.set()
self.processor.finalize()
def _task_finished(self): def _task_finished(self):
''' '''

View File

@@ -21,3 +21,10 @@ class ChatProcessor:
} }
''' '''
pass pass
def finalize(self, *args, **kwargs):
    '''
    Hook invoked once chat fetching has finished.

    The base implementation does nothing and returns None. Subclasses
    override it when they have work to do at the end of a run, such as
    writing out buffered data. Any positional or keyword arguments are
    accepted and ignored.
    '''

View File

@@ -52,8 +52,11 @@ class BaseRenderer:
if r: if r:
if r.get('emoji'): if r.get('emoji'):
message += r['emoji'].get('shortcuts', [''])[0] message += r['emoji'].get('shortcuts', [''])[0]
message_ex.append( message_ex.append({
r['emoji']['image']['thumbnails'][1].get('url')) 'id': r['emoji'].get('emojiId').split('/')[-1],
'txt': r['emoji'].get('shortcuts', [''])[0],
'url': r['emoji']['image']['thumbnails'][0].get('url')
})
else: else:
message += r.get('text', '') message += r.get('text', '')
message_ex.append(r.get('text', '')) message_ex.append(r.get('text', ''))

View File

@@ -1,31 +1,55 @@
import os import os
import re import re
import requests
from base64 import standard_b64encode
from .chat_processor import ChatProcessor from .chat_processor import ChatProcessor
from .default.processor import DefaultProcessor from .default.processor import DefaultProcessor
PATTERN = re.compile(r"(.*)\(([0-9]+)\)$") PATTERN = re.compile(r"(.*)\(([0-9]+)\)$")
fmt_headers = ['datetime', 'elapsed', 'authorName', fmt_headers = ['datetime', 'elapsed', 'authorName',
'message', 'superchat', 'type', 'authorChannel'] 'message', 'superchat', 'type', 'authorChannel']
HEADER_HTML = ''' HEADER_HTML = '''
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"> <meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
<html>
<head>
'''
TABLE_CSS = '''
table.css {
border-collapse: collapse;
}
table.css thead{
border-collapse: collapse;
border: 1px solid #000
}
table.css tr td{
padding: 0.3em;
border: 1px solid #000
}
table.css th{
padding: 0.3em;
border: 1px solid #000
}
''' '''
class HTMLArchiver(ChatProcessor): class HTMLArchiver(ChatProcessor):
''' '''
HtmlArchiver saves chat data as HTML table format. HTMLArchiver saves chat data as HTML table format.
''' '''
def __init__(self, save_path): def __init__(self, save_path):
super().__init__() super().__init__()
self.save_path = self._checkpath(save_path) self.save_path = self._checkpath(save_path)
with open(self.save_path, mode='a', encoding='utf-8') as f:
f.write(HEADER_HTML)
f.write('<table border="1" style="border-collapse: collapse">')
f.writelines(self._parse_html_header(fmt_headers))
self.processor = DefaultProcessor() self.processor = DefaultProcessor()
self.emoji_table = {} # table for custom emojis. key: emoji_id, value: base64 encoded image binary.
self.header = [HEADER_HTML]
self.body = ['<body>\n', '<table class="css">\n', self._parse_table_header(fmt_headers)]
def _checkpath(self, filepath): def _checkpath(self, filepath):
splitter = os.path.splitext(os.path.basename(filepath)) splitter = os.path.splitext(os.path.basename(filepath))
@@ -56,42 +80,58 @@ class HTMLArchiver(ChatProcessor):
""" """
if chat_components is None or len(chat_components) == 0: if chat_components is None or len(chat_components) == 0:
return return
self.body.extend(
with open(self.save_path, mode='a', encoding='utf-8') as f: (self._parse_html_line((
chats = self.processor.process(chat_components).items c.datetime,
for c in chats: c.elapsedTime,
f.writelines( c.author.name,
self._parse_html_line([ self._parse_message(c.messageEx),
c.datetime, c.amountString,
c.elapsedTime, c.author.type,
c.author.name, c.author.channelId)
c.message, ) for c in self.processor.process(chat_components).items)
c.amountString, )
c.author.type,
c.author.channelId]
)
)
'''
#Palliative treatment#
Comment out below line to prevent the table
display from collapsing.
'''
# f.write('</table>')
def _parse_html_line(self, raw_line): def _parse_html_line(self, raw_line):
html = '' return ''.join(('<tr>',
html += ' <tr>' ''.join(''.join(('<td>', cell, '</td>')) for cell in raw_line),
for cell in raw_line: '</tr>\n'))
html += '<td>' + cell + '</td>'
html += '</tr>\n'
return html
def _parse_html_header(self, raw_line): def _parse_table_header(self, raw_line):
html = '' return ''.join(('<thead><tr>',
html += '<thead>\n' ''.join(''.join(('<th>', cell, '</th>')) for cell in raw_line),
html += ' <tr>' '</tr></thead>\n'))
for cell in raw_line:
html += '<th>' + cell + '</th>' def _parse_message(self, message_items: list) -> str:
html += '</tr>\n' return ''.join(''.join(('<span class="', self._set_emoji_table(item), '" title="', item['txt'], '"></span>'))
html += '</thead>\n' if type(item) is dict else item
return html for item in message_items)
def _encode_img(self, url):
    '''
    Download the image at `url` and return it as base64 text.

    Parameter
    ----------
    url : str
        Location of the image to download.

    Returns
    ----------
    str
        Base64-encoded image bytes, or an empty string when the server
        answers with an HTTP error status.

    Raises
    ----------
    requests.RequestException
        On connection failures or when the timeout expires.
    '''
    # requests waits indefinitely unless a timeout is given; bound the wait
    # so that a single stalled download cannot hang the whole archive.
    resp = requests.get(url, timeout=10)
    if not resp.ok:
        # The body of an error response is not image data; embedding it
        # would produce a broken data URI, so store nothing instead.
        return ''
    return standard_b64encode(resp.content).decode()
def _set_emoji_table(self, item: dict):
emoji_id = item['id']
if emoji_id not in self.emoji_table:
self.emoji_table.setdefault(emoji_id, self._encode_img(item['url']))
return emoji_id
def _stylecode(self, name, code, width, height):
return ''.join((".", name, " { display: inline-block; background-image: url(data:image/png;base64,",
code, "); background-repeat: no-repeat; width: ",
str(width), "; height: ", str(height), ";}"))
def _create_styles(self):
    '''
    Build the <style> element of the document.

    The element holds the table CSS followed by one rule per entry of the
    emoji table, each rendered with a size of 24 by 24.

    Returns
    ----------
    str
        The complete <style> element, ending with a newline.
    '''
    emoji_rules = [self._stylecode(emoji_id, encoded, 24, 24)
                   for emoji_id, encoded in self.emoji_table.items()]
    parts = ['<style type="text/css">',
             TABLE_CSS,
             '\n'.join(emoji_rules),
             '</style>\n']
    return '\n'.join(parts)
def finalize(self):
    '''
    Complete the HTML document and write it to `save_path`.

    The style element and the closing head tag are added to the header,
    the closing table and body tags are added to the body, and then the
    header followed by the body is appended to the file.
    '''
    self.header.append(self._create_styles())
    self.header.append('</head>\n')
    self.body.append('</table>\n</body>')
    with open(self.save_path, mode='a', encoding='utf-8') as out:
        for section in (self.header, self.body):
            out.writelines(section)

View File

@@ -83,11 +83,13 @@ class Extractor:
data = self._execute_extract_operations() data = self._execute_extract_operations()
if self.processor is None: if self.processor is None:
return data return data
return self.processor.process( ret = self.processor.process(
[{'video_id': None, [{'video_id': None,
'timeout': 1, 'timeout': 1,
'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}] 'chatdata': (action["replayChatItemAction"]["actions"][0] for action in data)}]
) )
self.processor.finalize()
return ret
def cancel(self): def cancel(self):
asyncdl.cancel() asyncdl.cancel()