简要说明:
我有一个 Scrapy 项目,它从 Yahoo! 获取股票数据。金融。为了使我的项目顺利进行,我需要确保库存已经存在了所需的时间。为此,我首先抓取 CAT(Caterpillar Inc. (CAT) -NYSE),获取该时间段内的收盘价金额,然后确保此后抓取的所有股票的收盘价金额与 CAT 相同,从而确保股票已公开交易所需的时间长度。
问题:
这一切都工作得很好,但我的问题是,在 scrapy 完成解析 CAT 之前,它开始抓取其他股票并解析它们。这会导致错误,就像之前我可以从 CAT 获取所需数量的收盘价一样,scrapy 正在尝试确定是否有任何其他股票具有与 CAT 相同数量的收盘价,而该数量尚不存在。
实际问题
如何强制 scrapy 在开始解析其他网址之前完成解析一个网址
我也尝试过:
def start_requests(self):
global start_time
yield Request('http://finance.yahoo.com/q?s=CAT', self.parse)
# Waits 4 seconds to allow CAT to finish crawling
if time.time() - start_time > 0.2:
for i in self.other_urls:
yield Request(i, self.parse)
但股票在other_urls
永远不会开始,因为 scrapy 永远不会回到def start_requests
检查时间是否在上面0.2
整个代码:
from scrapy.selector import Selector
from scrapy import Request
from scrapy.exceptions import CloseSpider
from sharpeparser.gen_settings import *
from decimal import Decimal
from scrapy.spider import Spider
from sharpeparser.items import SharpeparserItem
import numpy
import time
if data_intervals == "m":
required_amount_of_returns = 24
elif data_intervals == "w":
required_amount_of_returns = 100
else:
required_amount_of_returns =
counter = 1
start_time = time.time()
class DnotSpider(Spider):
# ---- >>> ENSURE YOU INDENT 1 ---- >>>
# =======================================
name = "dnot"
allowed_domains = ["finance.yahoo.com", "http://eoddata.com/", "ca.finance.yahoo.com"]
start_urls = ['http://finance.yahoo.com/q?s=CAT']
other_urls = ['http://eoddata.com/stocklist/TSX.htm', 'http://eoddata.com/stocklist/TSX/B.htm', 'http://eoddata.com/stocklist/TSX/C.htm', 'http://eoddata.com/stocklist/TSX/D.htm', 'http://eoddata.com/stocklist/TSX/E.htm', 'http://eoddata.com/stocklist/TSX/F.htm', 'http://eoddata.com/stocklist/TSX/G.htm', 'http://eoddata.com/stocklist/TSX/H.htm', 'http://eoddata.com/stocklist/TSX/I.htm', 'http://eoddata.com/stocklist/TSX/J.htm', 'http://eoddata.com/stocklist/TSX/K.htm', 'http://eoddata.com/stocklist/TSX/L.htm', 'http://eoddata.com/stocklist/TSX/M.htm', 'http://eoddata.com/stocklist/TSX/N.htm', 'http://eoddata.com/stocklist/TSX/O.htm', 'http://eoddata.com/stocklist/TSX/P.htm', 'http://eoddata.com/stocklist/TSX/Q.htm', 'http://eoddata.com/stocklist/TSX/R.htm', 'http://eoddata.com/stocklist/TSX/S.htm', 'http://eoddata.com/stocklist/TSX/T.htm', 'http://eoddata.com/stocklist/TSX/U.htm', 'http://eoddata.com/stocklist/TSX/V.htm', 'http://eoddata.com/stocklist/TSX/W.htm', 'http://eoddata.com/stocklist/TSX/X.htm', 'http://eoddata.com/stocklist/TSX/Y.htm', 'http://eoddata.com/stocklist/TSX/Z.htm'
'http://eoddata.com/stocklist/NASDAQ/B.htm', 'http://eoddata.com/stocklist/NASDAQ/C.htm', 'http://eoddata.com/stocklist/NASDAQ/D.htm', 'http://eoddata.com/stocklist/NASDAQ/E.htm', 'http://eoddata.com/stocklist/NASDAQ/F.htm', 'http://eoddata.com/stocklist/NASDAQ/G.htm', 'http://eoddata.com/stocklist/NASDAQ/H.htm', 'http://eoddata.com/stocklist/NASDAQ/I.htm', 'http://eoddata.com/stocklist/NASDAQ/J.htm', 'http://eoddata.com/stocklist/NASDAQ/K.htm', 'http://eoddata.com/stocklist/NASDAQ/L.htm', 'http://eoddata.com/stocklist/NASDAQ/M.htm', 'http://eoddata.com/stocklist/NASDAQ/N.htm', 'http://eoddata.com/stocklist/NASDAQ/O.htm', 'http://eoddata.com/stocklist/NASDAQ/P.htm', 'http://eoddata.com/stocklist/NASDAQ/Q.htm', 'http://eoddata.com/stocklist/NASDAQ/R.htm', 'http://eoddata.com/stocklist/NASDAQ/S.htm', 'http://eoddata.com/stocklist/NASDAQ/T.htm', 'http://eoddata.com/stocklist/NASDAQ/U.htm', 'http://eoddata.com/stocklist/NASDAQ/V.htm', 'http://eoddata.com/stocklist/NASDAQ/W.htm', 'http://eoddata.com/stocklist/NASDAQ/X.htm', 'http://eoddata.com/stocklist/NASDAQ/Y.htm', 'http://eoddata.com/stocklist/NASDAQ/Z.htm',
'http://eoddata.com/stocklist/NYSE/B.htm', 'http://eoddata.com/stocklist/NYSE/C.htm', 'http://eoddata.com/stocklist/NYSE/D.htm', 'http://eoddata.com/stocklist/NYSE/E.htm', 'http://eoddata.com/stocklist/NYSE/F.htm', 'http://eoddata.com/stocklist/NYSE/G.htm', 'http://eoddata.com/stocklist/NYSE/H.htm', 'http://eoddata.com/stocklist/NYSE/I.htm', 'http://eoddata.com/stocklist/NYSE/J.htm', 'http://eoddata.com/stocklist/NYSE/K.htm', 'http://eoddata.com/stocklist/NYSE/L.htm', 'http://eoddata.com/stocklist/NYSE/M.htm', 'http://eoddata.com/stocklist/NYSE/N.htm', 'http://eoddata.com/stocklist/NYSE/O.htm', 'http://eoddata.com/stocklist/NYSE/P.htm', 'http://eoddata.com/stocklist/NYSE/Q.htm', 'http://eoddata.com/stocklist/NYSE/R.htm', 'http://eoddata.com/stocklist/NYSE/S.htm', 'http://eoddata.com/stocklist/NYSE/T.htm', 'http://eoddata.com/stocklist/NYSE/U.htm', 'http://eoddata.com/stocklist/NYSE/V.htm', 'http://eoddata.com/stocklist/NYSE/W.htm', 'http://eoddata.com/stocklist/NYSE/X.htm', 'http://eoddata.com/stocklist/NYSE/Y.htm', 'http://eoddata.com/stocklist/NYSE/Z.htm',
'http://eoddata.com/stocklist/HKEX/0.htm', 'http://eoddata.com/stocklist/HKEX/1.htm', 'http://eoddata.com/stocklist/HKEX/2.htm', 'http://eoddata.com/stocklist/HKEX/3.htm', 'http://eoddata.com/stocklist/HKEX/6.htm', 'http://eoddata.com/stocklist/HKEX/8.htm',
'http://eoddata.com/stocklist/LSE/0.htm', 'http://eoddata.com/stocklist/LSE/1.htm', 'http://eoddata.com/stocklist/LSE/2.htm', 'http://eoddata.com/stocklist/LSE/3.htm', 'http://eoddata.com/stocklist/LSE/4.htm', 'http://eoddata.com/stocklist/LSE/5.htm', 'http://eoddata.com/stocklist/LSE/6.htm', 'http://eoddata.com/stocklist/LSE/7.htm', 'http://eoddata.com/stocklist/LSE/8.htm', 'http://eoddata.com/stocklist/LSE/9.htm', 'http://eoddata.com/stocklist/LSE/A.htm', 'http://eoddata.com/stocklist/LSE/B.htm', 'http://eoddata.com/stocklist/LSE/C.htm', 'http://eoddata.com/stocklist/LSE/D.htm', 'http://eoddata.com/stocklist/LSE/E.htm', 'http://eoddata.com/stocklist/LSE/F.htm', 'http://eoddata.com/stocklist/LSE/G.htm', 'http://eoddata.com/stocklist/LSE/H.htm', 'http://eoddata.com/stocklist/LSE/I.htm', 'http://eoddata.com/stocklist/LSE/G.htm', 'http://eoddata.com/stocklist/LSE/K.htm', 'http://eoddata.com/stocklist/LSE/L.htm', 'http://eoddata.com/stocklist/LSE/M.htm', 'http://eoddata.com/stocklist/LSE/N.htm', 'http://eoddata.com/stocklist/LSE/O.htm', 'http://eoddata.com/stocklist/LSE/P.htm', 'http://eoddata.com/stocklist/LSE/Q.htm', 'http://eoddata.com/stocklist/LSE/R.htm', 'http://eoddata.com/stocklist/LSE/S.htm', 'http://eoddata.com/stocklist/LSE/T.htm', 'http://eoddata.com/stocklist/LSE/U.htm', 'http://eoddata.com/stocklist/LSE/V.htm', 'http://eoddata.com/stocklist/LSE/W.htm', 'http://eoddata.com/stocklist/LSE/X.htm', 'http://eoddata.com/stocklist/LSE/Y.htm', 'http://eoddata.com/stocklist/LSE/Z.htm',
'http://eoddata.com/stocklist/AMS/A.htm', 'http://eoddata.com/stocklist/AMS/B.htm', 'http://eoddata.com/stocklist/AMS/C.htm', 'http://eoddata.com/stocklist/AMS/D.htm', 'http://eoddata.com/stocklist/AMS/E.htm', 'http://eoddata.com/stocklist/AMS/F.htm', 'http://eoddata.com/stocklist/AMS/G.htm', 'http://eoddata.com/stocklist/AMS/H.htm', 'http://eoddata.com/stocklist/AMS/I.htm', 'http://eoddata.com/stocklist/AMS/J.htm', 'http://eoddata.com/stocklist/AMS/K.htm', 'http://eoddata.com/stocklist/AMS/L.htm', 'http://eoddata.com/stocklist/AMS/M.htm', 'http://eoddata.com/stocklist/AMS/N.htm', 'http://eoddata.com/stocklist/AMS/O.htm', 'http://eoddata.com/stocklist/AMS/P.htm', 'http://eoddata.com/stocklist/AMS/Q.htm', 'http://eoddata.com/stocklist/AMS/R.htm', 'http://eoddata.com/stocklist/AMS/S.htm', 'http://eoddata.com/stocklist/AMS/T.htm', 'http://eoddata.com/stocklist/AMS/U.htm', 'http://eoddata.com/stocklist/AMS/V.htm', 'http://eoddata.com/stocklist/AMS/W.htm', 'http://eoddata.com/stocklist/AMS/X.htm', 'http://eoddata.com/stocklist/AMS/Y.htm', 'http://eoddata.com/stocklist/AMS/Z.htm',
'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=A', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=B', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=C', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=D', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=E', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=F', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=G', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=H', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=I', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=J', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=K', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=L', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=M', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=N', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=O', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=P', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=Q', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=R', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=S', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=T', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=U', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=V', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=W', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=X', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=Y', 'https://ca.finance.yahoo.com/q/cp?s=%5EIXIC&alpha=Z',
'https://ca.finance.yahoo.com/q/cp?s=%5EHSI&alpha=0', 'https://ca.finance.yahoo.com/q/cp?s=%5EHSI&alpha=1', 'https://ca.finance.yahoo.com/q/cp?s=%5EHSI&alpha=2', 'https://ca.finance.yahoo.com/q/cp?s=%5EHSI&alpha=3',
'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=A', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=B', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=C', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=D', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=E', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=F', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=G', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=H', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=I', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=J', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=K', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=L', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=M', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=N', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=O', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=P', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=Q', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=R', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=S', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=T', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=U', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=V', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=W', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=X', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=Y', 'http://finance.yahoo.com/q/cp?s=%5EN100&alpha=Z',
'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=A', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=B', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=C', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=D', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=E', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=F', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=G', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=H', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=I', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=J', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=K', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=L', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=M', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=N', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=O', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=P', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=Q', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=R', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=S', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=T', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=U', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=V', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=W', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=X', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=Y', 'http://finance.yahoo.com/q/cp?s=%5EFCHI&alpha=Z',
'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=A', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=B', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=C', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=D', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=E', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=F', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=G', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=H', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=I', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=J', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=K', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=L', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=M', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=N', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=O', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=P', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=Q', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=R', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=S', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=T', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=U', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=V', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=W', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=X', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=Y', 'http://finance.yahoo.com/q/cp?s=%5EAEX&alpha=Z']
def start_requests(self):
global start_time
yield Request('http://finance.yahoo.com/q?s=CAT', self.parse)
# Waits 4 seconds to allow CAT to finish crawling
if time.time() - start_time > 0.2:
for i in self.other_urls:
yield Request(i, self.parse)
def parse(self, response):
if "eoddata" in response.url:
companyList = response.xpath('//tr[@class="ro"]/td/a/text()').extract()
for company in companyList:
if "TSX" in response.url:
go = 'http://finance.yahoo.com/q/hp?s={0}.TO&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
elif "LSE" in response.url:
go = 'http://finance.yahoo.com/q/hp?s={0}.L&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
elif "HKEX" in response.url:
go = 'http://finance.yahoo.com/q/hp?s={0}.HK&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
elif "AMS" in response.url:
go = 'https://ca.finance.yahoo.com/q/hp?s={0}.AS&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
else:
go = 'https://ca.finance.yahoo.com/q/hp?s={0}&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
elif "http://finance.yahoo.com/q?s=CAT" in response.url:
go = 'http://finance.yahoo.com/q/hp?s=CAT&a={0}&b={1}&c={2}&d={3}&e={4}&f={5}&g={6}'.format(beginning_month, beginning_day, beginning_year, ending_month, ending_day, ending_year, data_intervals)
yield Request(go, self.stocks1)
else:
rows = response.xpath('//table[@class="yfnc_tableout1"]//table/tr')[1:]
for row in rows:
company = row.xpath('.//td[1]/b/a/text()').extract()
go = 'http://finance.yahoo.com/q/hp?s={0}&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}&g={7}'.format(company, beginning_day, beginning_month, beginning_year, ending_day, ending_month, ending_year, data_intervals)
yield Request(go, self.stocks1)
def stocks1(self, response):
current_page = response.url
print current_page
# If the link is not the same as the first page, ie. stocks1 is requested through stocks2, get the stock data from stocks2
if initial_ending not in current_page[-iel:]:
returns_pages = response.meta.get('returns_pages')
# Remove the last stock price from the stock list, because it is the same as the first on the new list
if not not returns_pages:
if len(returns_pages) > 2:
returns_pages = returns_pages[:-1]
else:
# Else, if the link does match that of the first page, create a new list becuase one does not exist yet
returns_pages = []
# This grabs the stock data from the page
rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
print "stocks1"
print returns_pages
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
values = cells[-1]
try:
float(values)
# And adds it to returns_pages
returns_pages.append(values)
except ValueError:
continue
except ValueError:
continue
print "after"
print returns_pages
# exp determines if there is a 'Next page' or not
exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract()
# If there is a 'Next Page':
if not not exp:
# And this is the first page:
if initial_ending in current_page[-iel:]:
#create necessary url for the 2nd page
next_page = current_page + "&z=66&y=66"
# If this is not the first page
else:
# This increases the end of the link by 66, thereby getting the next 66 results on for pages 2 and after
u = int(current_page[-6:].split("=",1)[1])
o = len(str(u))
u += 66
next_page = current_page[:-o] + str(u)
print next_page, "66&y in curr_page"
# Then go back to self.stocks1 to get more data on the next page
yield Request(next_page, self.stocks2, meta={'returns_pages': returns_pages}, dont_filter=True)
# Else, if there is no 'Next Link'
else:
# Send the retuns to finalize.stock to be saved in the item
yield Request(current_page, callback=self.finalize_stock, meta={'returns_pages': returns_pages}, dont_filter=True)
def stocks2(self, response):
# Prints the link of the current url
current_page = response.url
print current_page
# Gets the returns from the previous page
returns_pages = response.meta.get('returns_pages')
# Removes the last return from the previous page because it will be a duplicate
returns_pages = returns_pages[:-1]
print "stocks2"
print returns_pages
# Gets all of the returns on the page
rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
values = cells[-1]
try:
float(values)
# And adds it to the previous returns
returns_pages.append(values)
except ValueError:
continue
except ValueError:
continue
print "after 2"
print returns_pages
# exp determines if there is a 'Next page' or not
exp = response.xpath('//td[@align="right"]/a[@rel="next"]').extract()
# If there is a 'Next Page':
if not not exp:
# And somehow, this is the first page (should never be true)
if initial_ending in current_page[-iel:]:
# Add necessary link to go to the second page
next_page = current_page + "&z=66&y=66"
print next_page, "66&y not in curr_page"
# Else, this is not the first page (should always be true)
else:
# add 66 to the last number on the preceeding link in order to access the second or later pages
u = int(current_page[-6:].split("=",1)[1])
o = len(str(u))
u += 66
next_page = current_page[:-o] + str(u)
print next_page, "66&y in curr_page"
# go back to self.stocks1 to get more data on the next page
yield Request(next_page, self.stocks1, meta={'returns_pages': returns_pages}, dont_filter=True)
else:
# If there is no "Next" link, send the retuns to finalize.stock to be saved in the item
yield Request(current_page, callback=self.finalize_stock, meta={'returns_pages': returns_pages}, dont_filter=True)
print "sending to finalize stock"
def finalize_stock(self,response):
current_page = response.url
print "====================="
print "finalize_stock called"
print current_page
print "====================="
unformatted_returns = response.meta.get('returns_pages')
returns = [float(i) for i in unformatted_returns]
global required_amount_of_returns, counter
if counter == 1 and "CAT" in response.url:
required_amount_of_returns = len(returns)
elif required_amount_of_returns == 0:
raise CloseSpider("'Error with initiating required amount of returns'")
counter += 1
print counter
# Iterator to calculate Rate of return
# ====================================
if data_intervals == "m":
k = 12
elif data_intervals == "w":
k = 4
else:
k = 30
sub_returns_amount = required_amount_of_returns - k
sub_returns = returns[:sub_returns_amount]
rate_of_return = []
RFR = 0.03
# Make sure list is exact length, otherwise rate_of_return will be inaccurate
# Returns has not been checked by pipeline yet, so small lists will be in the variable
if len(returns) > required_amount_of_returns:
for number in sub_returns:
numerator = number - returns[k]
rate = numerator/returns[k]
if rate == '':
rate = 0
rate_of_return.append(rate)
k += 1
item = SharpeparserItem()
items = []
item['url'] = response.url
item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
item['avg_returns'] = numpy.average(rate_of_return)
item['var_returns'] = numpy.cov(rate_of_return)
item['sd_returns'] = numpy.std(rate_of_return)
item['returns'] = unformatted_returns
item['rate_of_returns'] = rate_of_return
item['exchange'] = response.xpath('//span[@class="rtq_exch"]/text()').extract()
item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
items.append(item)
yield item