Skip to content
This repository was archived by the owner on Sep 24, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions lib/facebook_data_analyzer/analyzeables/contacts.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ class Contacts < Analyzeable
attr_reader :contacts
def initialize(catalog:)
@catalog = catalog
@directory = "#{catalog}/html/"
@file_pattern = 'contact_info.htm'
@directory = "#{catalog}/about_you/"
@file_pattern = 'your_address_books.html'
@contacts = []

super()
Expand All @@ -17,14 +17,10 @@ def analyze
content = File.open(@file_pattern).read
doc = Nokogiri::HTML(content)

contacts_rows = doc.css('div.contents tr')
contacts_rows = doc.at_css('._4t5n').children

unique_contacts = contacts_rows.each_with_object({}) do |contact, seen_contacts|
text = contact.text

next if text == 'NameContacts'

seen_contacts[text] = Contact.parse(contact_text: text)
seen_contacts[contact.children[0].text] = Contact.parse(contact_text: contact)
end

unique_contacts.values.each do |contact|
Expand Down
6 changes: 3 additions & 3 deletions lib/facebook_data_analyzer/analyzeables/friends.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ class Friends < Analyzeable

def initialize(catalog:)
@catalog = catalog
@directory = "#{catalog}/html/"
@file_pattern = 'friends.htm'
@directory = "#{catalog}/friends/"
@file_pattern = 'friends.html'
@friends = []

super()
Expand All @@ -23,7 +23,7 @@ def analyze
Dir.chdir(@directory) do
content = File.open(@file_pattern).read
doc = Nokogiri::HTML(content)
friends_list = doc.css('div.contents > ul')[0].css('li')
friends_list = doc.at_css('._4t5n').children

friends_list.each do |friend_element|
friend = Friend.parse(friend_element: friend_element)
Expand Down
33 changes: 19 additions & 14 deletions lib/facebook_data_analyzer/analyzeables/messages.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,23 @@ class Messages < Analyzeable
def initialize(catalog:, options: {})
@verbose = options.fetch(:verbose) { false }
@catalog = catalog
@directory = "#{catalog}/messages"
@file_pattern = '*.html'
@directory = "#{catalog}/messages/inbox"
@file_pattern = '**/*.html'
@messages = []

# super(parallel: options.fetch(:parallel) { false })
super(parallel: options.fetch(:parallel, false))
end

def me
@me ||= Nokogiri::HTML(File.open("#{@catalog}/index.htm")).title.split(' - Profile')[0].to_sym
@me ||= Nokogiri::HTML(File.open("#{@catalog}/profile_information/profile_information.html")).css('table._4nkx td')[0].text.to_sym
end

def analyze
Dir.chdir(@directory) do
messages_files = Dir.glob(@file_pattern)
messages_files = Dir.glob(@file_pattern).map{|f| File.realpath(f)}

puts me

# This block will be skipped if all message files have already been parsed
::Parallel.each(messages_files, in_processes: @processes_supported, progress: 'Parsing Messages') do |file|
Expand All @@ -51,13 +53,13 @@ def analyze
content: message.content }
end

File.open("_#{file}.json", 'w') do |json|
File.open("#{file}.json", 'w') do |json|
json.write(conversation_messages.to_json)
end
end unless @verbose || (Dir.glob('_*.json').count == messages_files.count)
end unless @verbose || (Dir.glob('**/*.json').count == messages_files.count)

semaphore = Mutex.new
parsed_message_files = Dir.glob('_*.json')
parsed_message_files = Dir.glob('**/*.json')
::Parallel.each(parsed_message_files, in_threads: @threads_supported, progress: 'Analyzing Messages') do |json_file|
json_message_array = JSON.parse(File.read(json_file))
messages = json_message_array.map do |message|
Expand Down Expand Up @@ -340,21 +342,24 @@ def most_popular_english_words
def extract_messages(file:)
content = File.open(file)
doc = Nokogiri::HTML(content)
conversation_name = doc.title.split('Conversation with ')[1]
conversation_name = doc.title

#puts conversation_name

return [] if conversation_name.nil?

conversation = doc.at_css('.thread').children
conversation = doc.at_css('._4t5n').children
conversation_senders = []
conversation_contents = []
messages = []

conversation.each do |node|
if node.name == 'div' && node['class'] == 'message'
conversation_senders << node
elsif node.name == 'p'
# There are empty <p> as padding around images
conversation_contents << node unless node.children.count == 0
if node.name == 'div' && node['class'] == 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder' && node.children.count >= 3
conversation_senders << [node.children[0], node.children[2]]
conversation_contents << node.children[1]
#elsif node.name == 'p'
# # There are empty <p> as padding around images
# conversation_contents << node unless node.children.count == 0
end
end

Expand Down
5 changes: 3 additions & 2 deletions lib/facebook_data_analyzer/contact.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
module FacebookDataAnalyzer
class Contact
def self.parse(contact_text:)
contact_info = contact_text.split('contact: ')
name_text = contact_text.children[0].text
details = contact_text.children.drop(1)

Contact.new(name: String(contact_info[0]), details: contact_info[1..3].join(' '))
Contact.new(name: String(name_text), details: details.join(' '))
end

attr_reader :name, :details
Expand Down
9 changes: 2 additions & 7 deletions lib/facebook_data_analyzer/friend.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,8 @@ class Friend
attr_reader :name, :date_added

def self.parse(friend_element:)
friend_with_email = friend_element.text.match(/(.*)\s\((.*)\)\s\((.*)\)/)

if friend_with_email
name, date_added = friend_with_email.captures
else
name, date_added = friend_element.text.match(/(.*)\s\((.*)\)/).captures
end
name = friend_element.children[0].text
date_added = friend_element.children.last.text

date = if date_added == 'Today'
Date.today
Expand Down
9 changes: 2 additions & 7 deletions lib/facebook_data_analyzer/message.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,8 @@
module FacebookDataAnalyzer
class Message
def self.parse(sender_info:, content:, conversation:)
# To avoid searching, making a hash of child.name.child.class
hashed_children = {}
message_header = sender_info.children[0]
message_header.children.each { |c| hashed_children["#{c.name}.#{c['class']}"] = c }

sender = hashed_children['span.user'].text
date_sent = DateTime.parse(hashed_children['span.meta'].text)
sender = sender_info[0].text
date_sent = DateTime.parse(sender_info[1].text)
# There are some legit messages that are empty <p>'s for some reason
raw_content = (content&.text || 'messageremoved').downcase
# Removes everything that's not alphanumeric (except for spaces and $)
Expand Down