For data preperation I used Zichen Wang's Kaggle notebook Explore Enron as a reference.
import numpy as np
import email
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
emails = pd.read_csv('emails.csv')
print(emails.shape)
emails.head()
(517401, 2)
file | message | |
---|---|---|
0 | allen-p/_sent_mail/1. | Message-ID: <18782981.1075855378110.JavaMail.e... |
1 | allen-p/_sent_mail/10. | Message-ID: <15464986.1075855378456.JavaMail.e... |
2 | allen-p/_sent_mail/100. | Message-ID: <24216240.1075855687451.JavaMail.e... |
3 | allen-p/_sent_mail/1000. | Message-ID: <13505866.1075863688222.JavaMail.e... |
4 | allen-p/_sent_mail/1001. | Message-ID: <30922949.1075863688243.JavaMail.e... |
print(emails['message'][5])
Message-ID: <30965995.1075863688265.JavaMail.evans@thyme> Date: Thu, 31 Aug 2000 04:17:00 -0700 (PDT) From: phillip.allen@enron.com To: greg.piper@enron.com Subject: Re: Hello Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Transfer-Encoding: 7bit X-From: Phillip K Allen X-To: Greg Piper X-cc: X-bcc: X-Folder: \Phillip_Allen_Dec2000\Notes Folders\'sent mail X-Origin: Allen-P X-FileName: pallen.nsf Greg, How about either next Tuesday or Thursday? Phillip
## Helper functions
def get_text_from_email(msg):
'''To get the content from email objects'''
parts = []
for part in msg.walk():
if part.get_content_type() == 'text/plain':
parts.append( part.get_payload() )
return ''.join(parts)
def split_email_addresses(line):
'''To separate multiple email addresses'''
if line:
addrs = line.split(',')
addrs = frozenset(map(lambda x: x.strip(), addrs))
else:
addrs = None
return addrs
# Parse the emails into a list email objects
messages = list(map(email.message_from_string, emails['message']))
emails.drop('message', axis=1, inplace=True)
# Get fields from parsed email objects
keys = messages[0].keys()
for key in keys:
emails[key] = [doc[key] for doc in messages]
# Parse content from emails
emails['content'] = list(map(get_text_from_email, messages))
# Split multiple email addresses
emails['From'] = emails['From'].map(split_email_addresses)
emails['To'] = emails['To'].map(split_email_addresses)
# Extract the root of 'file' as 'user'
emails['user'] = emails['file'].map(lambda x:x.split('/')[0])
#strips beginning and end of whitespace and replaces all space characters with ' '
emails['content'] = emails['content'].apply(lambda x: x.strip()).str.replace(r'\s', ' ')
emails.head()
file | Message-ID | Date | From | To | Subject | Mime-Version | Content-Type | Content-Transfer-Encoding | X-From | X-To | X-cc | X-bcc | X-Folder | X-Origin | X-FileName | content | user | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | allen-p/_sent_mail/1. | <18782981.1075855378110.JavaMail.evans@thyme> | Mon, 14 May 2001 16:39:00 -0700 (PDT) | (phillip.allen@enron.com) | (tim.belden@enron.com) | 1.0 | text/plain; charset=us-ascii | 7bit | Phillip K Allen | Tim Belden <Tim Belden/Enron@EnronXGate> | \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... | Allen-P | pallen (Non-Privileged).pst | Here is our forecast | allen-p | |||
1 | allen-p/_sent_mail/10. | <15464986.1075855378456.JavaMail.evans@thyme> | Fri, 4 May 2001 13:51:00 -0700 (PDT) | (phillip.allen@enron.com) | (john.lavorato@enron.com) | Re: | 1.0 | text/plain; charset=us-ascii | 7bit | Phillip K Allen | John J Lavorato <John J Lavorato/ENRON@enronXg... | \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se... | Allen-P | pallen (Non-Privileged).pst | Traveling to have a business meeting takes the... | allen-p | ||
2 | allen-p/_sent_mail/100. | <24216240.1075855687451.JavaMail.evans@thyme> | Wed, 18 Oct 2000 03:00:00 -0700 (PDT) | (phillip.allen@enron.com) | (leah.arsdall@enron.com) | Re: test | 1.0 | text/plain; charset=us-ascii | 7bit | Phillip K Allen | Leah Van Arsdall | \Phillip_Allen_Dec2000\Notes Folders\'sent mail | Allen-P | pallen.nsf | test successful. way to go!!! | allen-p | ||
3 | allen-p/_sent_mail/1000. | <13505866.1075863688222.JavaMail.evans@thyme> | Mon, 23 Oct 2000 06:13:00 -0700 (PDT) | (phillip.allen@enron.com) | (randall.gay@enron.com) | 1.0 | text/plain; charset=us-ascii | 7bit | Phillip K Allen | Randall L Gay | \Phillip_Allen_Dec2000\Notes Folders\'sent mail | Allen-P | pallen.nsf | Randy, Can you send me a schedule of the sal... | allen-p | |||
4 | allen-p/_sent_mail/1001. | <30922949.1075863688243.JavaMail.evans@thyme> | Thu, 31 Aug 2000 05:07:00 -0700 (PDT) | (phillip.allen@enron.com) | (greg.piper@enron.com) | Re: Hello | 1.0 | text/plain; charset=us-ascii | 7bit | Phillip K Allen | Greg Piper | \Phillip_Allen_Dec2000\Notes Folders\'sent mail | Allen-P | pallen.nsf | Let's shoot for Tuesday at 11:45. | allen-p |
print('Dataframe Shape:', emails.shape)
# Find number of unique values in each columns
for col in emails.columns:
print(col, emails[col].nunique())
Dataframe Shape: (517401, 18) file 517401 Message-ID 517401 Date 224128 From 20328 To 54748 Subject 159290 Mime-Version 1 Content-Type 2 Content-Transfer-Encoding 3 X-From 27980 X-To 73552 X-cc 33701 X-bcc 132 X-Folder 5335 X-Origin 259 X-FileName 429 content 247696 user 150
# Set index and drop columns with two few values
emails = emails.set_index('Message-ID')\
.drop(['file', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding'], axis=1)
# Parse datetime
emails['Date'] = pd.to_datetime(emails['Date'], infer_datetime_format=True)
emails.dtypes
Date datetime64[ns] From object To object Subject object X-From object X-To object X-cc object X-bcc object X-Folder object X-Origin object X-FileName object content object user object dtype: object
Only @enron.com emails
Remove Duplicates, Forwards, and CCs
Seperate out Emails with one Reciepient
Take out emails accounts that do not represent a person's communication
emails.columns
Index(['Date', 'From', 'To', 'Subject', 'X-From', 'X-To', 'X-cc', 'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'content', 'user'], dtype='object')
# narrow down columns needed for network and linguistic analysis
data = emails[['Date', 'From', 'To', 'Subject', 'content']]
data = data.reset_index()
data.head()
Message-ID | Date | From | To | Subject | content | |
---|---|---|---|---|---|---|
0 | <18782981.1075855378110.JavaMail.evans@thyme> | 2001-05-14 23:39:00 | (phillip.allen@enron.com) | (tim.belden@enron.com) | Here is our forecast | |
1 | <15464986.1075855378456.JavaMail.evans@thyme> | 2001-05-04 20:51:00 | (phillip.allen@enron.com) | (john.lavorato@enron.com) | Re: | Traveling to have a business meeting takes the... |
2 | <24216240.1075855687451.JavaMail.evans@thyme> | 2000-10-18 10:00:00 | (phillip.allen@enron.com) | (leah.arsdall@enron.com) | Re: test | test successful. way to go!!! |
3 | <13505866.1075863688222.JavaMail.evans@thyme> | 2000-10-23 13:13:00 | (phillip.allen@enron.com) | (randall.gay@enron.com) | Randy, Can you send me a schedule of the sal... | |
4 | <30922949.1075863688243.JavaMail.evans@thyme> | 2000-08-31 12:07:00 | (phillip.allen@enron.com) | (greg.piper@enron.com) | Re: Hello | Let's shoot for Tuesday at 11:45. |
def non_enron(froze_address):
"""returns True for where there are no enron email addresses in sent"""
if len(froze_address)==1:
for i in froze_address:
if i[-9:]=="enron.com":
return False
else:
return True
else:
for i in froze_address:
if i[-9:]=="enron.com":
return False
"there are no enron emails there"
return True
#test function
test=frozenset(['phil@oldman.com', 'jimmy@exon.com', 'tiffany@walk.com', 'lotsofpeople@enron.com'])
non_enron(test)
False
# data we need to do centrality measure
#'Date' and index will be used to stop repeats
n_data = data[['Message-ID','Date','From', 'To', 'Subject', 'content']].dropna()
#selects only intra-enron emails
n_data=n_data.drop(n_data[n_data.From.apply(non_enron)].index)
n_data=n_data.drop(n_data[n_data.To.apply(non_enron)].index)
n_data.head()
Message-ID | Date | From | To | Subject | content | |
---|---|---|---|---|---|---|
0 | <18782981.1075855378110.JavaMail.evans@thyme> | 2001-05-14 23:39:00 | (phillip.allen@enron.com) | (tim.belden@enron.com) | Here is our forecast | |
1 | <15464986.1075855378456.JavaMail.evans@thyme> | 2001-05-04 20:51:00 | (phillip.allen@enron.com) | (john.lavorato@enron.com) | Re: | Traveling to have a business meeting takes the... |
2 | <24216240.1075855687451.JavaMail.evans@thyme> | 2000-10-18 10:00:00 | (phillip.allen@enron.com) | (leah.arsdall@enron.com) | Re: test | test successful. way to go!!! |
3 | <13505866.1075863688222.JavaMail.evans@thyme> | 2000-10-23 13:13:00 | (phillip.allen@enron.com) | (randall.gay@enron.com) | Randy, Can you send me a schedule of the sal... | |
4 | <30922949.1075863688243.JavaMail.evans@thyme> | 2000-08-31 12:07:00 | (phillip.allen@enron.com) | (greg.piper@enron.com) | Re: Hello | Let's shoot for Tuesday at 11:45. |
#drop duplicates that have the same Date the same sender and reciepient
n_data = n_data.drop_duplicates(subset=['Date', 'From', 'To'])
#remove forwarded emails
print(len(n_data))
n_data = n_data.drop(labels=n_data.loc[n_data['content'].str.startswith("---------------------- Forwarded by")].index)
n_data = n_data.drop(labels=n_data.loc[n_data['content'].str.startswith("\n---------------------- Forwarded by")].index)
n_data = n_data.drop(labels=n_data.loc[n_data['content'].str.startswith("----- Forwarded by")].index)
n_data = n_data.drop(labels=n_data.loc[n_data['content'].str.startswith(" \n----Forwarded by")].index)
print(len(n_data))
160950 153362
# remove email chains leaving just the most recent message
n_data['content'] = n_data['content'].str.split(pat="Forwarded by")
n_data['content'] = n_data['content'].apply(lambda x: x[0])
n_data['content'] = n_data['content'].str.split(pat="@ECT")
n_data['content'] = n_data['content'].apply(lambda x: x[0])
n_data['content'] = n_data['content'].str.split(pat="\nTo:")
n_data['content'] = n_data['content'].apply(lambda x: x[0])
n_data['content'] = n_data['content'].str.split(pat="@ENRON")
n_data['content'] = n_data['content'].apply(lambda x: x[0])
n_data['content'] = n_data['content'].str.split(pat="-Original Message-")
n_data['content'] = n_data['content'].apply(lambda x: x[0])
n_data['content'] = n_data['content'].str.split(pat="\n\t\n\tFrom:")
n_data['content'] = n_data['content'].apply(lambda x: x[0])
n_data['content'] = n_data['content'].str.split(pat="\n\n\nFrom:")
n_data['content'] = n_data['content'].apply(lambda x: x[0])
n_data['content'] = n_data['content'].str.split(pat=" \n\t\t To: ")
n_data['content'] = n_data['content'].apply(lambda x: x[0])
n_data['content'] = n_data['content'].str.split(pat="\n\tSent by:")
n_data['content'] = n_data['content'].apply(lambda x: x[0])
n_data['content'] = n_data['content'].str.split(pat='\n\n>>>')
n_data['content'] = n_data['content'].apply(lambda x: x[0])
n_data['content'] = n_data['content'].str.split(pat='\n=09')
n_data['content'] = n_data['content'].apply(lambda x: x[0])
private = n_data.loc[n_data.To.apply(len)==1] # 1 to 1 correspondants
m_recipients = n_data.loc[n_data.To.apply(len)>1] # 1 to n correspondents
len(private), len(m_recipients)
(104893, 48469)
# column containing the number of recipients
m_recipients['n_recipients']= m_recipients['To'].apply(lambda x: len(x))
C:\Users\benti\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Some emails that are meant for a mass audience might not suit our analysis.
m_recipients= m_recipients.loc[m_recipients['n_recipients']<47]
#iterate over dataframe and create a new dataframe with single relations and same Message ID takes a lot of time
hold_d = {'Message-ID':[], 'Date':[],
'From':[], 'To':[],
'content':[], 'n_recipients':[]}
for index, row in m_recipients.iterrows():
g = row
for i in row['To']:
hold_d['Message-ID'].append(g['Message-ID'])
hold_d['Date'].append(g['Date'])
hold_d['From'].append(g['From'])
hold_d['To'].append(frozenset([i]))
hold_d['content'].append(g['content'])
hold_d['n_recipients'].append(g['n_recipients'])
m_emails = pd.DataFrame.from_dict(hold_d)
private['n_recipients'] = 1
C:\Users\benti\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
email_exchanges = pd.concat([private, m_emails])
email_exchanges.head()
C:\Users\benti\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'. """Entry point for launching an IPython kernel.
Date | From | Message-ID | Subject | To | content | n_recipients | |
---|---|---|---|---|---|---|---|
0 | 2001-05-14 23:39:00 | (phillip.allen@enron.com) | <18782981.1075855378110.JavaMail.evans@thyme> | (tim.belden@enron.com) | Here is our forecast | 1 | |
1 | 2001-05-04 20:51:00 | (phillip.allen@enron.com) | <15464986.1075855378456.JavaMail.evans@thyme> | Re: | (john.lavorato@enron.com) | Traveling to have a business meeting takes the... | 1 |
2 | 2000-10-18 10:00:00 | (phillip.allen@enron.com) | <24216240.1075855687451.JavaMail.evans@thyme> | Re: test | (leah.arsdall@enron.com) | test successful. way to go!!! | 1 |
3 | 2000-10-23 13:13:00 | (phillip.allen@enron.com) | <13505866.1075863688222.JavaMail.evans@thyme> | (randall.gay@enron.com) | Randy, Can you send me a schedule of the sal... | 1 | |
4 | 2000-08-31 12:07:00 | (phillip.allen@enron.com) | <30922949.1075863688243.JavaMail.evans@thyme> | Re: Hello | (greg.piper@enron.com) | Let's shoot for Tuesday at 11:45. | 1 |
#remove emails with no content
email_exchanges = email_exchanges.loc[email_exchanges.content.apply(lambda x: len(x))>0]
email_exchanges[['From', 'To', 'Date']]\
.groupby(['From', 'To'])\
.count().reset_index()\
.sort_values('Date', ascending=False)\
.head(20)
From | To | Date | |
---|---|---|---|
38347 | (pete.davis@enron.com) | (pete.davis@enron.com) | 3904 |
50694 | (jeff.dasovich@enron.com) | (susan.mara@enron.com) | 662 |
50843 | (jeff.dasovich@enron.com) | (richard.shapiro@enron.com) | 581 |
50718 | (jeff.dasovich@enron.com) | (paul.kaufman@enron.com) | 579 |
114263 | (michelle.nelson@enron.com) | (mike.maggi@enron.com) | 494 |
50714 | (jeff.dasovich@enron.com) | (james.steffes@enron.com) | 478 |
67482 | (mike.maggi@enron.com) | (michelle.nelson@enron.com) | 452 |
50752 | (jeff.dasovich@enron.com) | (karen.denne@enron.com) | 433 |
58313 | (kay.mann@enron.com) | (suzanne.adams@enron.com) | 428 |
54544 | (susan.mara@enron.com) | (jeff.dasovich@enron.com) | 364 |
86815 | (vince.kaminski@enron.com) | (shirley.crenshaw@enron.com) | 306 |
50877 | (jeff.dasovich@enron.com) | (skean@enron.com) | 279 |
50846 | (jeff.dasovich@enron.com) | (harry.kingerski@enron.com) | 278 |
25179 | (evelyn.metoyer@enron.com) | (kate.symes@enron.com) | 272 |
33868 | (leslie.hansen@enron.com) | (tana.jones@enron.com) | 268 |
58308 | (kay.mann@enron.com) | (ben.jacoby@enron.com) | 266 |
117026 | (bill.williams@enron.com) | (kate.symes@enron.com) | 255 |
58273 | (kay.mann@enron.com) | (kathleen.carnahan@enron.com) | 250 |
50748 | (jeff.dasovich@enron.com) | (d..steffes@enron.com) | 242 |
60491 | (sara.shackleton@enron.com) | (susan.bailey@enron.com) | 238 |
There are many accounts which are not of interest to our networks. Remove edges relating to nodes selfs.
remove_set = frozenset(['pete.davis@enron.com',
'enron.announcements@enron.com',
'all.worldwide@enron.com',
'all.houston@enron.com',
'houston.report@enron.com',
'all.states@enron.com',
'outlook.team@enron.com',
'administration.enron@enron.com',
'dl-ga-all_enron_worldwide2@enron.com',
'exchange.administrator@enron.com',
'all_ena_egm_eim@enron.com',
'perfmgmt@enron.com',
'chairman.ken@enron.com',
'enron.services@enron.com',
'enron.expertfinder@enron.com',
'europe.eol@enron.com'
])
email_exchanges = email_exchanges.drop(index=email_exchanges.loc[email_exchanges.From <= remove_set].index)
email_exchanges = email_exchanges.drop(index=email_exchanges.loc[email_exchanges.To <= remove_set].index)
email_exchanges.From.value_counts().describe()
count 6143.000000 mean 71.571545 std 278.972687 min 1.000000 25% 2.000000 50% 8.000000 75% 40.000000 max 9448.000000 Name: From, dtype: float64
email_exchanges.To.value_counts().describe()
count 15332.000000 mean 28.676233 std 121.117071 min 1.000000 25% 1.000000 50% 3.000000 75% 13.000000 max 4150.000000 Name: To, dtype: float64
email_exchanges = email_exchanges.reset_index(drop=True)
# Write to json or some file which will be easier to export
#all interesting analysis is below
email_exchanges.to_json("email_exchanges.json")