#!/usr/bin/python
import urllib2
import re

# download the web page at url and save it under the given filename
def downURL(url, filename):
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print 'download exception:', url
        return False
    op = open(filename, 'wb')
    # copy the response body to disk in chunks
    while True:
        s = fp.read(8192)
        if not s:
            break
        op.write(s)

    fp.close()
    op.close()
    return True

# extract URLs from a web page (the pattern matches only the scheme and host)
def getURLs(url):
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print 'get url exception:', url
        return []
    pattern = re.compile(r'http://[\w\.]+')
    s = fp.read()
    fp.close()
    urls = pattern.findall(s)
    return urls

# crawl one level deep: download the start page and every page it links to
def spider(startURL):
    urls = [startURL]
    for url in getURLs(startURL):
        print url
        if url not in urls:
            urls.append(url)
    i = 0
    while urls:
        url = urls.pop(0)
        i = i + 1
        downURL(url, str(i) + '.html')
    return True

# test
spider('http://www.baidu.com')