Appengine wo blobs #2

Open · wants to merge 72 commits into master
Changes from all commits
42ebc29
[Tools][createdb]
Sep 17, 2010
0512d50
[ItemPipeline]
Sep 17, 2010
cc25834
* added utils and settings directories to root path of the project
Sep 19, 2010
1606596
[web2py]
Sep 19, 2010
1f6b979
initial commit of *not finished* firefox add-on
sardok Oct 14, 2010
7e8b6e6
Replaced hardcoded 'bookcrawler' with 'kitapsever'.
Nov 10, 2010
dde1161
Set default database table from bookcrawler to kitapsever
Nov 10, 2010
01a23e6
database settings file update
Nov 10, 2010
deb2200
initial commit of Makefile for firefox-extension
Dec 3, 2010
4c40002
[firefox-addon] Removed KitapSever.xpi, user needs to execute make com…
Dec 9, 2010
41b3dc0
[firefox-addon] make forces to clean available .xpi file.
Dec 13, 2010
bdb525a
[firefox-addon] Updated project home page url.
Dec 13, 2010
c3818fe
[firefox-addon]
Dec 13, 2010
600cf6d
[firefox-extension]
Dec 16, 2010
56ad230
[firefox-addon] Better formatted popup message.
Dec 21, 2010
420633f
[firefox addon] Added icon.
Feb 8, 2011
e4a1bcc
[firefox-addon] updated icon.
Feb 9, 2011
7d93675
[firefox-addon] Disabled autorun for now.
Feb 9, 2011
72760a8
Data is written to a file instead of being stored in the database.
Feb 14, 2011
5314f97
Fix price extraction error in ideefix.com spider
Feb 14, 2011
d0ede94
Remove redundant line
Feb 14, 2011
24b8bc8
Initial checkin for kitapsever Google App Engine application.
Feb 14, 2011
5c18456
Add a new handler to query books by isbn.
Feb 14, 2011
fcb640d
Make crawler get book service address from settings
Feb 14, 2011
a0f7873
Make chrome-extension use new Google AppEngine service as book service.
Feb 14, 2011
6c6a6ce
Added new 128 pixel icon
Feb 15, 2011
cc3e101
Fixed ilknokta.com spider
Feb 15, 2011
03e1737
Fix for pandora.com.tr spider
Feb 15, 2011
6ddb94d
Fixed Pandora.com.tr spider
Feb 16, 2011
7b4a92d
Extracted a new AppEngineExportPipeline from FileExportPipeline
Feb 16, 2011
050df04
Replaced urllib2 with urllib.
Feb 16, 2011
0f9c540
Make Pandora.com.tr spider deny beyoglu.pandora.com.tr links which ap…
Feb 16, 2011
d535d0e
Disabled FileExportPipeline
Feb 16, 2011
58b6d10
Parse ISBNs that end with X as check digit
Feb 16, 2011
2de6712
Replaced db.GqlQuery calls with Model.gql calls
Feb 16, 2011
c8e866c
Added AllBooksByISBNQueryHandler to list all the book entries with a …
Feb 16, 2011
01fe533
Changed Clean handler such that it deletes 5000 book entries at once
Feb 16, 2011
026fb25
Removed createdb.py and readdb.py helper scripts.
Feb 16, 2011
f7d21a9
Make chrome-extension parse ISBNs that end with X
Feb 17, 2011
8d344f6
Set the version to 0.1.1
Feb 17, 2011
0619b09
Updated README with branch info.
sardok Feb 18, 2011
10ffaf5
initial commit of *not finished* firefox add-on
sardok Oct 14, 2010
bb45e4e
initial commit of Makefile for firefox-extension
Dec 3, 2010
caf8191
[firefox-addon] Removed KitapSever.xpi, user needs to execute make com…
Dec 9, 2010
933b123
[firefox-addon] make forces to clean available .xpi file.
Dec 13, 2010
e802107
[firefox-addon] Updated project home page url.
Dec 13, 2010
5cd61b2
[firefox-addon]
Dec 13, 2010
d9f7d27
[firefox-extension]
Dec 16, 2010
697549a
[firefox-addon] Better formatted popup message.
Dec 21, 2010
ad3918a
[firefox addon] Added icon.
Feb 8, 2011
51b715e
[firefox-addon] updated icon.
Feb 9, 2011
1cfb340
[firefox-addon] Disabled autorun for now.
Feb 9, 2011
34875cc
Some cleaning plus made code resemble more to chrome-extension codes …
Feb 18, 2011
d19411d
[appengine] BookQueryHandler does not need to return results back.
Feb 19, 2011
201bf03
[firefox-addon] Enabled LOAD_BYPASS_CACHE on connection between serve…
sardok Feb 21, 2011
c0eb2c0
[chrome-extension] Moved ISBN parser logic from the end to the beginn…
Feb 20, 2011
67eed3e
Added DropItem exception raising logic
Feb 21, 2011
a11c70c
Price's input_processor logic uses TakeLast logic as opposed to TakeF…
Feb 21, 2011
b74d62d
Fix for netkitap.com spider
Feb 21, 2011
bb56ccd
Merge remote branch 'sardok/appengine_wo_blobs'
Mar 23, 2011
40b85ae
Merge remote branch 'rimbi/appengine_wo_blobs'
Mar 23, 2011
7eb3c79
[crawler] Added run_all.sh script to run all the spiders respectively.
sardok Mar 23, 2011
6ac6a68
[crawler] Added useful options to the crawler's run script.
Mar 23, 2011
660c06d
[crawler] removed deprecated run_all script
Mar 23, 2011
60c93ca
[firefox-addon] Removed meaningless items.
Mar 24, 2011
b0825ad
[firefox-addon]
Mar 24, 2011
f120e27
[firefox-addon] Only one notification box instance should be displayed.
Mar 24, 2011
10f0a20
[firefox-addon]
Mar 24, 2011
b7ad6c4
[firefox-addon] C style comments.
Mar 25, 2011
3c40141
[firefox-addon]
Mar 25, 2011
ef85afa
[Spider: kitapyurdu] Fix price parser.
Apr 11, 2011
8a26b18
[Spider: pandora] Fix price parse.
Jun 27, 2011
1 change: 1 addition & 0 deletions README
@@ -0,0 +1 @@
Bookcrawler branch which works on google appengine.
7 changes: 4 additions & 3 deletions chrome-extension/contentscript.js
@@ -1,4 +1,8 @@

var re = /ISBN[:\s]*([X0-9\\-]*)/g;
rawISBN = document.body.innerText.match(re)[0];
isbn = rawISBN.replace(/-/g, "").slice(-10, -1)

function createRootElement(id) {
root = document.createElement("div");
root.id = id;
@@ -46,9 +50,6 @@ function showBooks(responseText) {
}
}

var re = /ISBN[:\s]*([0-9\\-]*)/g;
rawISBN = document.body.innerText.match(re)[0];
isbn = rawISBN.replace(/-/g, "").slice(-10)
//alert(isbn);
chrome.extension.sendRequest({'action' : 'fetchBooks', 'selectedText' : isbn}, showBooks);
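The hunk above moves the ISBN extraction to the top of the content script and widens the regex to accept an X check digit. A minimal Python sketch of the same extraction idea (the sample text and helper name are hypothetical, not part of the PR):

```python
import re

# Mirrors the widened pattern: digits, hyphens, and an X check digit
# are all accepted after the "ISBN" label.
ISBN_RE = re.compile(r'ISBN[:\s]*([X0-9\-]*)')

def extract_isbn(text):
    """Return the first ISBN-like token with hyphens stripped, or None."""
    match = ISBN_RE.search(text)
    if match is None:
        return None
    return match.group(1).replace('-', '')

print(extract_isbn('ISBN: 975-8717-07-X'))  # 975871707X
```

The content script additionally trims the result with `slice(-10, -1)`; the sketch stops at hyphen stripping.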

Binary file removed chrome-extension/icon.png
Binary file added chrome-extension/icon_128.png
2 changes: 1 addition & 1 deletion chrome-extension/index.html
@@ -4,7 +4,7 @@
<body>
<script type="text/javascript">
function sendServiceRequest(selectedText, callback) {
var serviceCall = "http://127.0.0.1:8000/myapp/default/query.xml?column_name=isbn&query_string=" + selectedText;
var serviceCall = "http://rimbiskitapsever.appspot.com/bookbyisbn?isbn=" + selectedText;
var req = new XMLHttpRequest();
req.open("GET", serviceCall, true);
req.onload = showBooks;
8 changes: 2 additions & 6 deletions chrome-extension/manifest.json
@@ -1,14 +1,10 @@
{
"name": "Kitapsever",
"version": "1.0",
"version": "0.1.1",
"description": "Kitapsever için en uygun kitabı bulur.",
// "browser_action": {
// "default_icon": "icon.png"
// },

"icons": {
"48" : "icon.png",
"128" : "icon.png"
"128" : "icon_128.png"
},
"background_page" : "index.html",
"permissions": [
3 changes: 2 additions & 1 deletion crawler/crawler/items.py
@@ -5,7 +5,7 @@
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field
from scrapy.contrib.loader.processor import Join, TakeFirst
from scrapy.contrib.loader.processor import Join, TakeFirst, Compose

class BookItem(Item):
# define the fields for your item here like:
@@ -32,6 +32,7 @@ class BookItem(Item):
price = Field(
default = u'0 TL',
output_processor = TakeFirst(),
input_processor = Compose(lambda v: v[-1:]),
)
store = Field(
default = 0,
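Per the commit message, the new `input_processor` gives the price field take-last rather than take-first semantics: `Compose(lambda v: v[-1:])` trims the collected values to the final one before `TakeFirst()` runs. A standalone sketch of that behaviour (the sample prices are hypothetical):

```python
# The lambda passed to Compose: keep only the last collected value.
take_last = lambda values: values[-1:]

# When several prices are scraped from one page, only the final one
# survives, so the TakeFirst() output processor then returns it.
collected = ['34,00 TL', '25,50 TL']
print(take_last(collected))  # ['25,50 TL']
```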
102 changes: 45 additions & 57 deletions crawler/crawler/pipelines.py
@@ -6,74 +6,62 @@
from scrapy.xlib.pydispatch import dispatcher
from scrapy.core import signals
from string import replace
from sqlalchemy import create_engine
from sqlalchemy import Table, Column, Integer, Float, Unicode, MetaData, and_
from sqlalchemy.orm import mapper, sessionmaker
from crawler.settings import BOOK_SERVICE_ADDRESS
import urllib
from scrapy.core.exceptions import DropItem

class Book(object):
def __init__(self, name, isbn, author, publisher, link, price, store):
self.name = name
self.isbn = isbn
self.author = author
self.publisher = publisher
self.link = link
self.price = price
self.store = store
ITEM_SEPERATOR = ";"

def __repr__(self):
return u"<Book('%s', '%s', '%s', '%s', '%s', '%f' '%d')>" % (self.name, self.isbn, self.author, self.publisher, self.link, self.price, self.store)

metadata = MetaData()

books_table = Table('books', metadata,
Column('id', Integer, primary_key=True),
Column('isbn', Unicode(255)),
Column('name', Unicode(255)),
Column('author', Unicode(255)),
Column('publisher', Unicode(255)),
Column('link', Unicode(255)),
Column('price', Float(precision=2)),
Column('store', Integer))

mapper(Book, books_table)
class AppEngineExportPipeline(object):
def process_item(self, spider, item):
try:
link = item['link'].strip()
isbn = item['isbn'].strip().replace("-", "")
if len(isbn) >= 10:
isbn = isbn[-10:-1]
price = replace(item['price'], ',', '.')
store = str(item['store'])
line = isbn + ITEM_SEPERATOR
line = line + link + ITEM_SEPERATOR
line = line + price + ITEM_SEPERATOR
line = line + store + "\n"
params = urllib.urlencode({'isbn': isbn, 'price': price, 'store': store, 'link': link})
f = urllib.urlopen(BOOK_SERVICE_ADDRESS + '?%s' % params)
f.close()
except AttributeError:
print "Attribute error in parsing item at %s" % link
raise DropItem()

class DbExportPipeline(object):
i = 0
return item

class FileExportPipeline(object):
def __init__(self):
dispatcher.connect(self.spider_opened, signals.spider_opened)
dispatcher.connect(self.spider_closed, signals.spider_closed)
self.session = None
self.out_file = None

def spider_opened(self, spider):
self.session = sessionmaker(bind=create_engine('mysql://root:123456@localhost/bookcrawler', echo=True))()
DbExportPipeline.i += 1
self.out_file = open(spider.domain_name + ".txt", "w")

def spider_closed(self, spider):
DbExportPipeline.i -= 1
if DbExportPipeline.i == 0:
self.session.close()
self.out_file.close()

def process_item(self, spider, item):
book_isbn = item['isbn'].strip().replace("-", "")
if len(book_isbn) == 13:
book_isbn = book_isbn[-10:]
book_name = unicode(item['name'].strip())
book_author = unicode(item['author'].strip())
book_publisher = unicode(item['publisher'].strip())
book_link = unicode(item['link'].strip())
book_price = float(replace(item['price'], ',', '.'))
book_store = item['store']
book = self.session.query(Book).filter(and_(Book.isbn == book_isbn, Book.store == book_store)).first()
if book is None:
book = Book(book_name, book_isbn, book_author, book_publisher, book_link, book_price, book_store)
self.session.add(book)
else:
book.price = book_price
book.name = book_name
book.author = book_author
book.publisher = book_publisher
book.link = book_link
self.session.flush()
self.session.commit()
try:
link = item['link'].strip()
isbn = item['isbn'].strip().replace("-", "")
if len(isbn) >= 10:
isbn = isbn[-10: -1]
price = replace(item['price'], ',', '.')
store = str(item['store'])
line = isbn + ITEM_SEPERATOR
line = line + link + ITEM_SEPERATOR
line = line + price + ITEM_SEPERATOR
line = line + store + "\n"
self.out_file.write(line)
except AttributeError:
print "Attribute error in parsing item at %s" % link
raise DropItem()

return item
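The `AppEngineExportPipeline` above URL-encodes the book fields and issues a GET against `BOOK_SERVICE_ADDRESS`. A Python 3 sketch of the URL it composes (the field values are hypothetical, and the original uses Python 2's `urllib.urlencode`/`urllib.urlopen`):

```python
from urllib.parse import urlencode

# The commented-out development setting from settings.py.
BOOK_SERVICE_ADDRESS = 'http://localhost:8080/book'

# Same four fields the pipeline sends per scraped item.
params = urlencode({'isbn': '975871707X', 'price': '9.90',
                    'store': '2', 'link': 'http://example.com/book'})
url = '%s?%s' % (BOOK_SERVICE_ADDRESS, params)
print(url)
```

The pipeline then opens this URL (`urllib.urlopen(url)`) and immediately closes the response; the App Engine handler does the actual datastore write.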

9 changes: 7 additions & 2 deletions crawler/crawler/settings.py
@@ -18,6 +18,11 @@
DEFAULT_ITEM_CLASS = 'crawler.items.BookItem'
#USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.A.B.C Safari/525.13'
ITEM_PIPELINES = ['crawler.pipelines.DbExportPipeline']
ITEM_PIPELINES = [
# 'crawler.pipelines.FileExportPipeline',
'crawler.pipelines.AppEngineExportPipeline'
]
CONCURRENT_REQUESTS_PER_SPIDER = 1
DOWNLOAD_DELAY = 2
DOWNLOAD_DELAY = 1
BOOK_SERVICE_ADDRESS = 'http://rimbiskitapsever.appspot.com/book'
#BOOK_SERVICE_ADDRESS = 'http://localhost:8080/book'
4 changes: 2 additions & 2 deletions crawler/crawler/spiders/ideefixe.py
@@ -20,10 +20,10 @@ class IdefixSpider(CrawlSpider):
def parse_item(self, response):
l = XPathItemLoader(item=BookItem(), response=response)
l.add_xpath('name', '//div[@class=\'boxTanimisim\']/div/text()')
l.add_xpath('isbn', '//div[@id=\'tanitimbox\']/text()', u'.*ISBN : ([0-9]+)')
l.add_xpath('isbn', '//div[@id=\'tanitimbox\']/text()', u'.*ISBN : ([0-9X]+)')
l.add_xpath('author', '//div[@class=\'boxTanimVideo\']/a/text()')
l.add_xpath('publisher','//h3[@class=\'boxTanimyayinevi\']/a/b/text()')
l.add_xpath('price', '//b[@class=\'pricerange\']/text()', u'\s*(.*) TL \(KDV Dahil\)')
l.add_xpath('price', '//b[@class=\'pricerange\']/text()', u'\s*([0-9,]*) TL \(KDV Dahil\)')
l.add_value('link', response.url)
l.add_value('store', 2)
return l.load_item()
6 changes: 3 additions & 3 deletions crawler/crawler/spiders/ilknokta.py
@@ -13,14 +13,14 @@ class IlknoktaSpider(CrawlSpider):
start_urls = ['http://www.ilknokta.com/']

rules = (
Rule(SgmlLinkExtractor(allow=(r'/urun/.*', ), unique=True), 'parse_item', follow=True),
Rule(SgmlLinkExtractor(allow=(r'/kitap/.*', ), unique=True), 'parse_item', follow=True),
Rule(SgmlLinkExtractor(allow=(r'/.*', ), unique=True), ),
)

def parse_item(self, response):
l = XPathItemLoader(item=BookItem(), response=response)
l.add_xpath('name', '//font[@class=\'baslikt\']/strong/text()')
l.add_xpath('isbn', '//td/text()', u'.*ISBN: ([0-9\-]+)')
l.add_xpath('name', '//div[@class="divbaslik"]/@title')
l.add_xpath('isbn', '//td/text()', u'.*ISBN: ([0-9\-X]+)')
l.add_xpath('author', '//td[@class=\'yazart\']/a/text()')
l.add_xpath('publisher','//a[@class=\'yayineviU\']/text()')
l.add_xpath('price', '//font[@class=\'fiyat\']/text()', u'([0-9,]+) TL')
4 changes: 2 additions & 2 deletions crawler/crawler/spiders/imge.py
@@ -19,8 +19,8 @@ class ImgeSpider(CrawlSpider):
def parse_item(self, response):
l = XPathItemLoader(item=BookItem(), response=response)
l.add_xpath('name', '//td[@class=\'pageHeading\']/text()')
l.add_xpath('isbn', '//td[@class=\'main\']/text()', u'ISBN: ([0-9]+)')
l.add_xpath('isbn', '//td[@class=\'main\']/p/text()', u'ISBN: ([0-9]+)')
l.add_xpath('isbn', '//td[@class=\'main\']/text()', u'ISBN: ([0-9X]+)')
l.add_xpath('isbn', '//td[@class=\'main\']/p/text()', u'ISBN: ([0-9X]+)')
l.add_xpath('isbn', '//td[@class=\'main\']/p/text()', u'Barkod: ([0-9]+)')
l.add_xpath('author', '//a[contains(@href, "/person.php")]/b/font/text()')
l.add_xpath('publisher','//a[contains(@href, "manufacturers_id=")]/b/font/text()')
4 changes: 2 additions & 2 deletions crawler/crawler/spiders/kitapyurdu.py
@@ -19,10 +19,10 @@ class KitapyurduSpider(CrawlSpider):
def parse_item(self, response):
l = XPathItemLoader(item=BookItem(), response=response)
l.add_xpath('name', '//span[@class=\'kitapismi\']/text()')
l.add_xpath('isbn', '//span[@class=\'normalkucuk\']/text()', u'ISBN:([0-9]+)')
l.add_xpath('isbn', '//span[@class=\'normalkucuk\']/text()', u'ISBN:([0-9X]+)')
l.add_xpath('author', '//span/a[contains(@href, "/yazar/")]/text()')
l.add_xpath('publisher','//span/a[contains(@href, "/yayinevi/")]/text()')
l.add_xpath('price', '//td/text()', u'Kitapyurdu Fiyatı:(.*) TL\.')
l.add_xpath('price', '//td/text()', u'Kitapyurdu Fiyatı:\s([0-9,]*).*')
l.add_value('link', response.url)
l.add_value('store', 3)
return l.load_item()
5 changes: 2 additions & 3 deletions crawler/crawler/spiders/netkitap.py
@@ -19,11 +19,10 @@ class NetkitapSpider(CrawlSpider):
def parse_item(self, response):
l = XPathItemLoader(item=BookItem(), response=response)
l.add_xpath('name', '//h1[@class=\'kitapad14pnt\']/b/text()')
l.add_xpath('isbn', '//span[@class=\'kunye\']/text()', u'ISBN: ([0-9\-]+)')
l.add_xpath('isbn', '//span[@class=\'kunye\']/text()', u'ISBN: ([0-9\-X]+)')
l.add_xpath('author', '//span[@class=\'yazarad12pnt\']/a/span[@class=\'yazarad12pnt\']/text()')
l.add_xpath('publisher','//h3[@class=\'kapakyazisi\']/b/font/a/text()')
l.add_xpath('price', '//span[@class=\'kapakyazisi\']/font/b/text()', u'(.*) TL')
l.add_xpath('price', '//span[@class=\'kapakyazisi\']/b/text()', u'(.*) TL')
l.add_xpath('price', '//span[@class="kapakyazisi"]/font/b/text()', u'(.*) TL')
l.add_value('link', response.url)
l.add_value('store', 5)
return l.load_item()
12 changes: 6 additions & 6 deletions crawler/crawler/spiders/pandora.py
@@ -13,17 +13,17 @@ class PandoraSpider(CrawlSpider):
start_urls = ['http://www.pandora.com.tr/']

rules = (
Rule(SgmlLinkExtractor(allow=(r'/urun\.aspx\?id=',), unique=True), 'parse_item', follow=True),
Rule(SgmlLinkExtractor(allow=(r'/urun/.*',), deny_domains='beyoglu.pandora.com.tr', unique=True), 'parse_item', follow=True),
Rule(SgmlLinkExtractor(allow=(r'/.*', ), unique=True)),
)

def parse_item(self, response):
l = XPathItemLoader(item=BookItem(), response=response)
l.add_xpath('name', '//span[@id=\'ctl00_ContentPlaceHolderMainOrta_LabelAdi\']/text()')
l.add_xpath('isbn', '//span[@id=\'ctl00_ContentPlaceHolderMainOrta_LabelIsbn\']/text()')
l.add_xpath('author', '//span[@id=\'ctl00_ContentPlaceHolderMainOrta_LabelYazar\']/a/text()')
l.add_xpath('publisher','//a[@id=\'ctl00_ContentPlaceHolderMainOrta_HyperLinkYayinci\']/text()')
l.add_xpath('price', '//span[@class=\'fiyat\']/text()', u'(.*) TL')
l.add_xpath('name', '//span[@id="ContentPlaceHolderMainOrta_LabelAdi"]/text()')
l.add_xpath('isbn', '//span[@id="ContentPlaceHolderMainOrta_LabelIsbn"]/text()')
l.add_xpath('author', '//span[@id="ContentPlaceHolderMainOrta_LabelYazar"]/a/text()')
l.add_xpath('publisher','//a[@id="ContentPlaceHolderMainOrta_HyperLinkYayinci"]/text()')
l.add_xpath('price', '//span[@id=\'ContentPlaceHolderMainOrta_LabelFiyat\']/span[@class=\'fiyat\']/text()', u'(.*) TL')
l.add_value('link', response.url)
l.add_value('store', 4)
return l.load_item()
30 changes: 0 additions & 30 deletions crawler/createdb.py

This file was deleted.

68 changes: 68 additions & 0 deletions crawler/run.sh
@@ -0,0 +1,68 @@
#!/bin/sh

crawl_list="./scrapy-ctl.py list"
crawl_exec="./scrapy-ctl.py crawl"
pattern=".(com|net|gen|org)(.tr|)$"

list=0

while getopts "o:lh" optname
do
case "$optname" in
o)
echo "Warning: Only run a site"
site=$OPTARG
;;
h)
cat << EOF
usage:
./run [-o, -h] [site name]
-o: Optional parameter. Indicates that script should run only for a site instance.
If site is provided as parameter, script should exit after finishes its execution.

-l: Optional parameter. Lists the available book sites.

-h: Optional parameter. Prints this message.

[site name]: Optional parameter. Script starts from given site. If -o is provided, 'run' command should exit after executing the site.
If -o is not provided then, 'run' command should continue executing respectively.
example:
./run -o idefix.com
Run idefix.com then exits.

./run -l
List the book sites.

./run
Run all of the book sites.

EOF
exit 0
;;
l)
list=1
;;
:)
echo "HOP $OPTARG"
;;
*)
echo "Unknown error occured"
;;
esac
done

for line in $($crawl_list); do
if [[ $line =~ $pattern ]]; then
if [ -n "$site" ]; then
if [ "$site" != "$line" ]; then
continue;
fi
fi
echo "Book Site: '$line'"
if [ $list -eq 0 ]; then
echo "Crawling started . . ."
$($crawl_exec $line)
echo "Crawling done"
fi
fi
done