parse.py
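"""Parse a saved feed HTML page (one <div class="feed-item"> per listing) into a CSV.

Usage:
    python parse.py [path/to/feed.html] [path/to/output.csv]

Arguments are matched by file extension; defaults are ./feed.html and ./feed.csv.
"""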
import sys
import csv

from bs4 import BeautifulSoup


class FeedParser(object):
    def __init__(self, html_path, csv_dest):
        self.feed_html = None
        self.soup = None
        self.dest = csv_dest
        self.load_feed_html(html_path)

    def load_feed_html(self, html_path):
        # Read the raw HTML into memory so the parsed soup does not depend on an open file handle.
        with open(html_path, 'r') as html:
            self.feed_html = html.read()
        self.soup = BeautifulSoup(self.feed_html, 'lxml')
    def get_feed_items(self):
        if self.soup is None:
            print("no soup available")
            return []
        return self.soup.find_all('div', {'class': 'feed-item'})
    def create_csv(self):
        fields_to_write = ['title', 'designer', 'size', 'price', 'original_price', 'age', 'bumped']
        # newline='' keeps the csv module from emitting blank rows on Windows.
        with open(self.dest, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(fields_to_write)  # header row
            for feed_item_html in self.get_feed_items():
                # Skip placeholder entries marked with the 'empty-item' class.
                if 'empty-item' not in feed_item_html.attrs['class']:
                    try:
                        item = self.extract_feed_item_fields(feed_item_html)
                        writer.writerow([item[field_name] for field_name in fields_to_write])
                    except (AttributeError, IndexError, KeyError) as exc:
                        print("Failed to write feed item: {}".format(exc))
    def extract_feed_item_fields(self, feed_item_html):
        result = {}
        f = feed_item_html
        listing_age = f.select('h3 > .date-ago')
        # Two date elements mean the listing was bumped: the first holds the bump time,
        # the second holds the original (struck-through) listing age.
        if len(listing_age) == 2:
            result['age'] = listing_age[1].find('span', 'strike-through').text
            result['bumped'] = listing_age[0].text
        else:
            result['age'] = listing_age[0].text
            result['bumped'] = None
        result['designer'] = f.find('h3', 'listing-designer').text
        result['size'] = f.find('h3', 'listing-size').text
        result['title'] = f.find('h3', 'listing-title').text
        # A 'new-price' heading only appears when the item has been marked down.
        marked_down = f.find('h3', 'new-price') is not None
        result['original_price'] = f.find('h3', 'original-price').text
        result['price'] = f.find('h3', 'new-price').text if marked_down else result['original_price']
        return result

if __name__ == '__main__':
    # Defaults can be overridden by passing an .html source and/or a .csv destination on the command line.
    feed_html = './feed.html'
    csv_dest = './feed.csv'
    for arg in sys.argv[1:]:
        if arg.endswith('.html'):
            feed_html = arg
        elif arg.endswith('.csv'):
            csv_dest = arg
    dp = FeedParser(feed_html, csv_dest)
    dp.create_csv()