#!/usr/bin/env python from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import json import pickle import argparse import utils from utils import LOG_ERROR, LOG_DEBUG, LOG_INFO, LOG_WARN # Format string for participant summaries conversation_participant_summary = ''' Name: {} ID: {} Regular chat messages: {} Rename conversation messages: {} Remove user messages: {} Add user messages: {} Hangouts event messages: {}\n ''' class ConversationParticipant(object): ''' Helper class representing a participant in a Hangouts conversation. Right now it only tracks the id, name, and number of messages of each type that the person sent ''' def __init__(self, id, name='unknown'): self.id = id self.name = name self.num_messages = { 'REGULAR_CHAT_MESSAGE': 0, 'RENAME_CONVERSATION': 0, 'REMOVE_USER': 0, 'ADD_USER': 0, 'HANGOUT_EVENT': 0 } self.total_message_count = 0 def add_message(self, msg_type): if msg_type not in self.num_messages.keys(): LOG_DEBUG('Trying to parse unknown message type: {}'.format(msg_type)) return else: self.num_messages[msg_type] += 1 def update_total_message_count(self): self.total_message_count = sum(self.num_messages.values()) def get_total_message_count(self): self.update_total_message_count() return self.total_message_count def get_name_or_id(self): return self.name if self.name != 'unknown' else self.id def get_summary(self): return conversation_participant_summary.format( self.name, self.id, self.num_messages['REGULAR_CHAT_MESSAGE'], self.num_messages['RENAME_CONVERSATION'], self.num_messages['REMOVE_USER'], self.num_messages['ADD_USER'], self.num_messages['HANGOUT_EVENT']) class Conversation(object): ''' A Hangouts Conversation. Tracks the participants and the names of the conversation over time. ''' def __init__(self, state): self.state = state['conversation_state'] self.event = self.state['event'] self.conversation_id = self.state['conversation_id']['id'] self.type = self.state['conversation']['type'] if self.type == 'GROUP': if 'name' in self.state['conversation']: self.conversation_name = self.state['conversation']['name'] else: self.conversation_name = 'Unknown Group Message' else: self.conversation_name = 'Direct Message' self.participants = [] self.conversation_names = [] self.message_count = 0 self.hangouts_duration_s = 0 self.message_data = [] def add_participant(self, id, name='unknown'): new_participant = ConversationParticipant(id, name) self.participants.append(new_participant) def is_participant(self, id): return self.get_participant(id) is not None def get_participant(self, id): ''' Yes, this function is naive. But, I am making the assumption that most conversations being parsed will have a small number of participants such that a more elegant method would be unnecessary ''' # Iterate through all conversation participants for p in self.participants: # Check if target id matches participant id if p.id == id: return p return None def get_total_message_count(self): self.update_message_count() return self.message_count def get_hangout_duration_h(self): return self.hangouts_duration_s / 3600 def update_message_count(self): self.message_count = sum( [ p.get_total_message_count() for p in self.participants ]) def parse_initial_participants(self): for p in self.state['conversation']['participant_data']: id = p['id']['chat_id'] if 'fallback_name' in p: name = p['fallback_name'] else: # Default name name = 'unknown' self.add_participant(id, name) def parse_message(self, msg): # Extract details from message sender_id = msg['sender_id']['chat_id'] msg_type = msg['event_type'] timestamp = int(msg['timestamp']) / 1000000 # Check if conversation participant is known participant = self.get_participant(sender_id) if participant is None: LOG_DEBUG('Parsing message for unkown participant') self.message_data.append([timestamp, msg_type, sender_id]) return # Add message to database self.message_data.append([timestamp, msg_type, participant.get_name_or_id()]) participant.add_message(msg_type) # Perform message-type-based actions if msg_type == 'RENAME_CONVERSATION': self.conversation_names.append(msg['conversation_rename']['new_name']) elif msg_type == 'ADD_USER': for id in msg['membership_change']['participant_id']: if not self.is_participant(id['chat_id']): self.add_participant(id['chat_id']) elif msg_type == 'HANGOUT_EVENT': event = msg['hangout_event'] event_type = event['event_type'] if event_type == 'END_HANGOUT': self.hangouts_duration_s += int(event['hangout_duration_secs']) def parse(self): LOG_INFO('Parsing conversation with ID: {}'.format(self.conversation_id)) self.parse_initial_participants() for e in self.event: self.parse_message(e) def print_summary(self): print('Conversation ID: {}'.format(self.conversation_id)) if self.type == 'GROUP': print('Current conversation name: {}'.format(self.conversation_name)) print('Other conversation names: {}'.format(', '.join(self.conversation_names))) print('Total message count: {}'.format(self.get_total_message_count())) print('Time in video call (hours): {}'.format(self.get_hangout_duration_h())) print('Conversation participants:') for p in self.participants: print(p.get_summary()) def serialize(self, filename=None, prefix='output'): # Assemble all data from conversation hangouts_data = { 'conversation_id': self.conversation_id, 'conversation_name': self.conversation_name, 'other_conversation_names': self.conversation_names, 'message_count': self.get_total_message_count(), 'video_duration': self.get_hangout_duration_h(), 'participant_ids': [ p.id for p in self.participants ], 'participant_names': [ p.get_name_or_id() for p in self.participants ], 'messages': self.message_data } # Create output directory if it doesn't exist output_dir = os.path.join(os.getcwd(), prefix) if not os.path.isdir(output_dir): os.mkdir(output_dir) # Set default filename if filename is None: filename = '{}-parsed.pkl'.format(self.conversation_id) output_filename = os.path.join(output_dir, filename) LOG_INFO('Serializing conversation data to "{}"'.format(output_filename)) # Serialize the conversation and dump it to file with open(output_filename, 'wb') as f: pickle.dump(hangouts_data, f) def main(file_path): # Validate raw data path if not os.path.exists(file_path): LOG_ERROR('Could not find file: {}'.format(file_path)) return # Validate raw data file type if not file_path.endswith('.json'): LOG_ERROR('File path must be a json file') return # Parse JSON with open(file_path, encoding='utf-8') as f: LOG_INFO('Parsing JSON file: {}'.format(file_path)) json_archive = json.load(f) if not 'conversation_state' in json_archive.keys(): LOG_ERROR('Could not find `conversation_state` in file {}'.format(file_path)) return # Parse each conversation for state in json_archive['conversation_state']: conv = Conversation(state) conv.parse() conv.print_summary() conv.serialize() LOG_INFO('Finished parsing conversations!') if __name__ == "__main__": LOG_INFO('Started script') parser = argparse.ArgumentParser() parser.add_argument('-f', '--file-path', default='raw/Hangouts.json', type=str, dest='file_path', help='Path to raw data file') parser.add_argument('-l', '--log-level', default=1, type=int, dest='log_level', help='Minimum logging level to output') args = parser.parse_args() utils.set_log_level(args.log_level) main(args.file_path)