Initial commit

`parser.py` is used for parsing Hangouts log files
This commit is contained in:
Jacob Manning 2018-07-01 10:22:29 -04:00
commit 0997e44613
4 changed files with 327 additions and 0 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
*.swp
__pycache__

269
parser.py Normal file
View file

@ -0,0 +1,269 @@
#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import json
import pickle
import argparse
import utils
from utils import LOG_ERROR, LOG_DEBUG, LOG_INFO, LOG_WARN
# Format string for participant summaries
conversation_participant_summary = '''
Name: {}
ID: {}
Regular chat messages: {}
Rename conversation messages: {}
Remove user messages: {}
Add user messages: {}
Hangouts event messages: {}\n
'''
class ConversationParticipant(object):
'''
Helper class representing a participant in a Hangouts conversation. Right
now it only tracks the id, name, and number of messages of each type that
the person sent
'''
def __init__(self, id, name='unknown'):
self.id = id
self.name = name
self.num_messages = {
'regular_chat': 0,
'rename_conversation': 0,
'remove_user': 0,
'add_user': 0,
'hangout_event': 0
}
self.total_message_count = 0
def add_regular_chat_message(self):
self.num_messages['regular_chat'] += 1
def add_rename_conversation_message(self):
self.num_messages['rename_conversation'] += 1
def add_remove_user_message(self):
self.num_messages['remove_user'] += 1
def add_add_user_message(self):
self.num_messages['add_user'] += 1
def add_hangout_event_message(self):
self.num_messages['hangout_event'] += 1
def update_total_message_count(self):
self.total_message_count = sum(self.num_messages.values())
def get_total_message_count(self):
self.update_total_message_count()
return self.total_message_count
def get_name_or_id(self):
if self.name != 'unknown':
return self.name
return self.id
def get_summary(self):
return conversation_participant_summary.format(
self.name, self.id,
self.num_messages['regular_chat'],
self.num_messages['rename_conversation'],
self.num_messages['remove_user'],
self.num_messages['add_user'],
self.num_messages['hangout_event'])
class Conversation(object):
'''
A Hangouts Conversation. Tracks the participants and the names of the
conversation over time.
'''
def __init__(self, state):
self.state = state['conversation_state']
self.event = self.state['event']
self.conversation_id = self.state['conversation_id']['id']
self.type = self.state['conversation']['type']
if self.type == 'GROUP':
if 'name' in self.state['conversation']:
self.conversation_name = self.state['conversation']['name']
else:
self.conversation_name = 'Unknown Group Message'
else:
self.conversation_name = 'Direct Message'
self.participants = []
self.conversation_names = []
self.message_count = 0
self.hangouts_duration_s = 0
self.message_data = []
def add_participant(self, id, name='unknown'):
new_participant = ConversationParticipant(id, name)
self.participants.append(new_participant)
def is_participant(self, id):
return self.get_participant(id) is not None
def get_participant(self, id):
'''
Yes, this function is naive. But, I am making the assumption that
most conversations being parsed will have a small number of
participants such that a more elegant method would be unnecessary
'''
# Iterate through all conversation participants
for p in self.participants:
# Check if target id matches participant id
if p.id == id:
return p
return None
def get_total_message_count(self):
self.update_message_count()
return self.message_count
def get_hangout_duration_h(self):
return self.hangouts_duration_s / 3600
def update_message_count(self):
self.message_count = sum(
[ p.get_total_message_count() for p in self.participants ])
def parse_initial_participants(self):
for p in self.state['conversation']['participant_data']:
id = p['id']['chat_id']
if 'fallback_name' in p:
name = p['fallback_name']
else:
name = 'unknown'
self.add_participant(id, name)
def parse_message(self, msg):
sender_id = msg['sender_id']['chat_id']
msg_type = msg['event_type']
timestamp = int(msg['timestamp']) / 1000000
participant = self.get_participant(sender_id)
if participant is None:
LOG_DEBUG('Parsing message for unkown participant')
self.message_data.append([timestamp, msg_type, sender_id])
return
self.message_data.append([timestamp, msg_type, participant.get_name_or_id()])
if msg_type == 'REGULAR_CHAT_MESSAGE':
participant.add_regular_chat_message()
elif msg_type == 'RENAME_CONVERSATION':
participant.add_rename_conversation_message()
self.conversation_names.append(msg['conversation_rename']['new_name'])
elif msg_type == 'REMOVE_USER':
participant.add_remove_user_message()
elif msg_type == 'ADD_USER':
participant.add_add_user_message()
for id in msg['membership_change']['participant_id']:
if not self.is_participant(id['chat_id']):
self.add_participant(id['chat_id'])
elif msg_type == 'HANGOUT_EVENT':
participant.add_hangout_event_message()
event = msg['hangout_event']
event_type = event['event_type']
if event_type == 'END_HANGOUT':
self.hangouts_duration_s += int(event['hangout_duration_secs'])
else:
LOG_DEBUG('Trying to parse unknown message type: {}'.format(msg_type))
def parse(self):
LOG_INFO('Parsing conversation with ID: {}'.format(self.conversation_id))
self.parse_initial_participants()
for e in self.event:
self.parse_message(e)
def print_summary(self):
print('Conversation ID: {}'.format(self.conversation_id))
if self.type == 'GROUP':
print('Current conversation name: {}'.format(self.conversation_name))
print('Other conversation names: {}'.format(', '.join(self.conversation_names)))
print('Total message count: {}'.format(self.get_total_message_count()))
print('Time in video call (hours): {}'.format(self.get_hangout_duration_h()))
print('Conversation participants:')
for p in self.participants:
print(p.get_summary())
def serialize(self, filename=None, prefix='output'):
hangouts_data = {
'conversation_id': self.conversation_id,
'conversation_name': self.conversation_name,
'other_conversation_names': self.conversation_names,
'message_count': self.get_total_message_count(),
'video_duration': self.get_hangout_duration_h(),
'participant_ids': [ p.id for p in self.participants ],
'participant_names': [ p.get_name_or_id() for p in self.participants ],
'messages': self.message_data
}
output_dir = os.path.join(os.getcwd(), prefix)
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
if filename is None:
filename = '{}-parsed.pkl'.format(self.conversation_id)
output_filename = os.path.join(output_dir, filename)
LOG_INFO('Serializing conversation data to "{}"'.format(output_filename))
# Serialize the conversation and dump it to file
with open(output_filename, 'wb') as f:
pickle.dump(hangouts_data, f)
def main(file_path):
# Validate raw data path
if not os.path.exists(file_path):
LOG_ERROR('Could not find file: {}'.format(file_path))
return
# Validate raw data file type
if not file_path.endswith('.json'):
LOG_ERROR('File path must be a json file')
return
with open(file_path, encoding='utf-8') as f:
LOG_INFO('Parsing JSON file: {}'.format(file_path))
json_archive = json.load(f)
for state in json_archive['conversation_state']:
conv = Conversation(state)
conv.parse()
conv.print_summary()
conv.serialize()
LOG_INFO('Finished parsing conversations!')
if __name__ == "__main__":
LOG_INFO('Started script')
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--file-path', default='raw/Hangouts.json',
type=str, dest='file_path', help='Path to raw data file')
parser.add_argument('-l', '--log-level', default=1,
type=int, dest='log_level', help='Minimum logging level to output')
args = parser.parse_args()
utils.set_log_level(args.log_level)
main(args.file_path)

2
raw/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
*
!.gitignore

54
utils.py Normal file
View file

@ -0,0 +1,54 @@
'''
Simple logging helper functions
'''
_UTILS_MIN_LOG_LEVEL_PRIVATE = 0
_UTILS_LOG_LEVELS_PRIVATE = {
'DEBUG': 0,
'INFO': 1,
'WARN': 2,
'ERROR': 3,
}
def set_log_level(level):
'''
Sets the minimum log level that should be output
'''
global _UTILS_MIN_LOG_LEVEL_PRIVATE
global _UTILS_LOG_LEVELS_PRIVATE
if level in _UTILS_LOG_LEVELS_PRIVATE.values():
_UTILS_MIN_LOG_LEVEL_PRIVATE = level
else:
LOG_ERROR('utils.py: Requested log level unsupported')
def LOG(level, msg):
'''
Backbone logging function
'''
global _UTILS_MIN_LOG_LEVEL_PRIVATE
global _UTILS_LOG_LEVELS_PRIVATE
# Check if level is one of the defaults
if level in _UTILS_LOG_LEVELS_PRIVATE.keys():
# Check if the requested log level is greater than or equal to
# the minimum logging level
if _UTILS_LOG_LEVELS_PRIVATE[level] >= _UTILS_MIN_LOG_LEVEL_PRIVATE:
print('[', level, ']: ', msg)
else:
# If the level is not a default level, always print it
print('[', level, ']: ', msg)
def LOG_ERROR(msg):
LOG('ERROR', msg)
def LOG_WARN(msg):
LOG('WARN', msg)
def LOG_INFO(msg):
LOG('INFO', msg)
def LOG_DEBUG(msg):
LOG('DEBUG', msg)