diff --git a/lib/facebook_data_analyzer/analyzeables/contacts.rb b/lib/facebook_data_analyzer/analyzeables/contacts.rb index ae6e080..0276b8c 100644 --- a/lib/facebook_data_analyzer/analyzeables/contacts.rb +++ b/lib/facebook_data_analyzer/analyzeables/contacts.rb @@ -5,8 +5,8 @@ class Contacts < Analyzeable attr_reader :contacts def initialize(catalog:) @catalog = catalog - @directory = "#{catalog}/html/" - @file_pattern = 'contact_info.htm' + @directory = "#{catalog}/about_you/" + @file_pattern = 'your_address_books.html' @contacts = [] super() @@ -17,14 +17,10 @@ def analyze content = File.open(@file_pattern).read doc = Nokogiri::HTML(content) - contacts_rows = doc.css('div.contents tr') + contacts_rows = doc.at_css('._4t5n').children unique_contacts = contacts_rows.each_with_object({}) do |contact, seen_contacts| - text = contact.text - - next if text == 'NameContacts' - - seen_contacts[text] = Contact.parse(contact_text: text) + seen_contacts[contact.children[0].text] = Contact.parse(contact_text: contact) end unique_contacts.values.each do |contact| diff --git a/lib/facebook_data_analyzer/analyzeables/friends.rb b/lib/facebook_data_analyzer/analyzeables/friends.rb index 0853345..5e47616 100644 --- a/lib/facebook_data_analyzer/analyzeables/friends.rb +++ b/lib/facebook_data_analyzer/analyzeables/friends.rb @@ -12,8 +12,8 @@ class Friends < Analyzeable def initialize(catalog:) @catalog = catalog - @directory = "#{catalog}/html/" - @file_pattern = 'friends.htm' + @directory = "#{catalog}/friends/" + @file_pattern = 'friends.html' @friends = [] super() @@ -23,7 +23,7 @@ def analyze Dir.chdir(@directory) do content = File.open(@file_pattern).read doc = Nokogiri::HTML(content) - friends_list = doc.css('div.contents > ul')[0].css('li') + friends_list = doc.at_css('._4t5n').children friends_list.each do |friend_element| friend = Friend.parse(friend_element: friend_element) diff --git a/lib/facebook_data_analyzer/analyzeables/messages.rb 
b/lib/facebook_data_analyzer/analyzeables/messages.rb index 73f12db..620fac9 100755 --- a/lib/facebook_data_analyzer/analyzeables/messages.rb +++ b/lib/facebook_data_analyzer/analyzeables/messages.rb @@ -26,8 +26,8 @@ class Messages < Analyzeable def initialize(catalog:, options: {}) @verbose = options.fetch(:verbose) { false } @catalog = catalog - @directory = "#{catalog}/messages" - @file_pattern = '*.html' + @directory = "#{catalog}/messages/inbox" + @file_pattern = '**/*.html' @messages = [] # super(parallel: options.fetch(:parallel) { false }) @@ -35,12 +35,14 @@ def initialize(catalog:, options: {}) end def me - @me ||= Nokogiri::HTML(File.open("#{@catalog}/index.htm")).title.split(' - Profile')[0].to_sym + @me ||= Nokogiri::HTML(File.open("#{@catalog}/profile_information/profile_information.html")).css('table._4nkx td')[0].text.to_sym end def analyze Dir.chdir(@directory) do - messages_files = Dir.glob(@file_pattern) + messages_files = Dir.glob(@file_pattern).map{|f| File.realpath(f)} + + puts me # This block will be skipped if all message files have already been parsed ::Parallel.each(messages_files, in_processes: @processes_supported, progress: 'Parsing Messages') do |file| @@ -51,13 +53,13 @@ def analyze content: message.content } end - File.open("_#{file}.json", 'w') do |json| + File.open("#{file}.json", 'w') do |json| json.write(conversation_messages.to_json) end - end unless @verbose || (Dir.glob('_*.json').count == messages_files.count) + end unless @verbose || (Dir.glob('**/*.json').count == messages_files.count) semaphore = Mutex.new - parsed_message_files = Dir.glob('_*.json') + parsed_message_files = Dir.glob('**/*.json') ::Parallel.each(parsed_message_files, in_threads: @threads_supported, progress: 'Analyzing Messages') do |json_file| json_message_array = JSON.parse(File.read(json_file)) messages = json_message_array.map do |message| @@ -340,21 +342,24 @@ def most_popular_english_words def extract_messages(file:) content = File.open(file) doc = 
Nokogiri::HTML(content) - conversation_name = doc.title.split('Conversation with ')[1] + conversation_name = doc.title + + #puts conversation_name return [] if conversation_name.nil? - conversation = doc.at_css('.thread').children + conversation = doc.at_css('._4t5n').children conversation_senders = [] conversation_contents = [] messages = [] conversation.each do |node| - if node.name == 'div' && node['class'] == 'message' - conversation_senders << node - elsif node.name == 'p' - # There are empty
<p></p>
as padding around images - conversation_contents << node unless node.children.count == 0 + if node.name == 'div' && node['class'] == 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder' && node.children.count >= 3 + conversation_senders << [node.children[0], node.children[2]] + conversation_contents << node.children[1] + #elsif node.name == 'p' + # # There are empty
<p></p>
as padding around images + # conversation_contents << node unless node.children.count == 0 end end diff --git a/lib/facebook_data_analyzer/contact.rb b/lib/facebook_data_analyzer/contact.rb index aee988f..ef46a9a 100644 --- a/lib/facebook_data_analyzer/contact.rb +++ b/lib/facebook_data_analyzer/contact.rb @@ -3,9 +3,10 @@ module FacebookDataAnalyzer class Contact def self.parse(contact_text:) - contact_info = contact_text.split('contact: ') + name_text = contact_text.children[0].text + details = contact_text.children.drop(1) - Contact.new(name: String(contact_info[0]), details: contact_info[1..3].join(' ')) + Contact.new(name: String(name_text), details: details.join(' ')) end attr_reader :name, :details diff --git a/lib/facebook_data_analyzer/friend.rb b/lib/facebook_data_analyzer/friend.rb index d843667..34625b5 100644 --- a/lib/facebook_data_analyzer/friend.rb +++ b/lib/facebook_data_analyzer/friend.rb @@ -5,13 +5,8 @@ class Friend attr_reader :name, :date_added def self.parse(friend_element:) - friend_with_email = friend_element.text.match(/(.*)\s\((.*)\)\s\((.*)\)/) - - if friend_with_email - name, date_added = friend_with_email.captures - else - name, date_added = friend_element.text.match(/(.*)\s\((.*)\)/).captures - end + name = friend_element.children[0].text + date_added = friend_element.children.last.text date = if date_added == 'Today' Date.today diff --git a/lib/facebook_data_analyzer/message.rb b/lib/facebook_data_analyzer/message.rb index 04c6ca9..19c590f 100644 --- a/lib/facebook_data_analyzer/message.rb +++ b/lib/facebook_data_analyzer/message.rb @@ -3,13 +3,8 @@ module FacebookDataAnalyzer class Message def self.parse(sender_info:, content:, conversation:) - # To avoid searching, making a hash of child.name.child.class - hashed_children = {} - message_header = sender_info.children[0] - message_header.children.each { |c| hashed_children["#{c.name}.#{c['class']}"] = c } - - sender = hashed_children['span.user'].text - date_sent = 
DateTime.parse(hashed_children['span.meta'].text) + sender = sender_info[0].text + date_sent = DateTime.parse(sender_info[1].text) # There are some legit messages that are empty
<p></p>
's for some reason raw_content = (content&.text || 'messageremoved').downcase # Removes everything that's not alphanumeric (except for spaces and $)