Scrapes scraperwiki.com and classic.scraperwiki.com
ScraperWiki
To download data sign in with GitHub
rows 10 / 16895
lang | date_scraped | code | id | name | status | author |
---|---|---|---|---|---|---|
lang
php
|
date_scraped
2010-11-24 13:37:28
|
code
<?php
// Minimal test script (the listing is titled "PHP Bug"): fetch one page,
// store its raw HTML in the datastore and echo it back. The commented-out
// statements are leftover string/encoding debugging experiments.
$url = 'http://www.allbuffs.com/';
$html = scraperwiki::scrape($url);
// Print the include path PHP is using in this environment.
print get_include_path() . "\n";
// Non-ASCII sample strings; they are only referenced by the commented-out
// debug statements below.
$dail = "Daíl";
$q = "Здрав".
"ствуй".
"те";
//print html_entity_decode($q, ENT_NOQUOTES, 'UTF-8')."\n";
//echo $dail[1] . "\n";
//echo $dail . "\n";
//echo "Daíl\n";
//echo strlen("Hello"). "\n";
//echo strlen($html) . "\n";
// 'html' is both the only column and the unique key, so the whole page body
// acts as the record key.
scraperwiki::save(Array('html'), Array('html' => $html));
echo $html;
echo "Hello\n";
?>
|
id
php-bug
|
name
PHP Bug
|
status
|
author
|
lang
python
|
date_scraped
2011-02-07 17:20:04
|
code
###############################################################################
# START HERE: Tutorial 3: More advanced scraping. Shows how to follow 'next'
# links from page to page: use functions, so you can call the same code
# repeatedly. SCROLL TO THE BOTTOM TO SEE THE START OF THE SCRAPER.
###############################################################################
import scraperwiki
from BeautifulSoup import BeautifulSoup
# define the order our columns are displayed in the datastore
scraperwiki.metadata.save('data_columns', ['Artist', 'Album', 'Released', 'Sales (m)'])
# scrape_table function: gets passed an individual page to scrape
def scrape_table(soup):
# `soup` is the parsed page handed over by the caller. The function finds the
# first <table class="data">, walks its <tr> rows and copies the cells of rows
# that contain <td> elements into a record for the datastore.
# NOTE(review): indentation was lost in this listing, so whether the print and
# save statements sit inside `if table_cells:` must be confirmed against the
# original scraper.
data_table = soup.find("table", { "class" : "data" })
rows = data_table.findAll("tr")
for row in rows:
# Set up our data record - we'll need it later
record = {}
table_cells = row.findAll("td")
# Rows without any <td> give an empty list here and fail this test.
if table_cells:
record['Artist'] = table_cells[0].text
record['Album'] = table_cells[1].text
record['Released'] = table_cells[2].text
# Cell index 3 is not used; 'Sales (m)' is read from the fifth cell.
record['Sales (m)'] = table_cells[4].text
# Print out the data we've gathered
print record, '------------'
# Finally, save the record to the datastore - 'Artist' is our unique key
scraperwiki.datastore.save(["Artist"], record)
# scrape_and_look_for_next_link function: calls the scrape_table
# function, then hunts for a 'next' link: if one is found, moves on to it
def scrape_and_look_for_next_link(url):
    """Scrape `url` and every page reachable through its 'next' links.

    Each page is fetched with scraperwiki.scrape, parsed with BeautifulSoup
    and passed to scrape_table. The following page is taken from the first
    <a class="next"> element; its href is appended to the module-level
    `base_url` by plain string concatenation.

    The pages are walked with a loop instead of having the function call
    itself once per page, so a long chain of pages cannot exhaust Python's
    recursion limit. The sequence of fetches, saves and printed lines is the
    same as with the recursive form.
    """
    while True:
        html = scraperwiki.scrape(url)
        soup = BeautifulSoup(html)
        scrape_table(soup)
        next_link = soup.find("a", { "class" : "next" })
        print(next_link)
        if not next_link:
            # No 'next' link on this page: it was the last one.
            break
        url = base_url + next_link['href']
        print(url)
# ---------------------------------------------------------------------------
# START HERE: define your starting URL - then
# call a function to scrape the first page in the series.
# ---------------------------------------------------------------------------
base_url = 'http://www.madingley.org/uploaded/'
starting_url = base_url + 'example_table_1.html'
scrape_and_look_for_next_link(starting_url)
|
id
ti
|
name
ti
|
status
|
author
|
lang
|
date_scraped
2011-02-07 17:20:26
|
code
import scraperwiki
import mechanize
import urllib, urlparse
import lxml.etree, lxml.html
import re
import urlparse
import datetime
#scraperwiki.cache(True)
def Main():
# Entry point: loads the Judicial Decisions index page, collects the hrefs of
# the QuickNav links whose text begins with "Published", and passes them to
# parsesession.
purl = 'http://www.courtsni.gov.uk/en-GB/Judicial+Decisions/'
root = lxml.html.parse(purl).getroot()
# re.match anchors at the start of the link text.
urls = [ a.get('href') for a in root.cssselect('ul.QuickNav li a') if re.match("Published", a.text) ]
for i, url in enumerate(urls):
# Only the link at index 0 is processed; every later one is skipped.
if i >= 1: # skip all but the first
continue
print i, url
parsesession(url)
def cleanup(data):
    """Normalise one judgment record in place.

    Changes made to `data` (a dict):

    * 'Author' and 'Judge' must be equal (asserted); both keys are removed
      and replaced by 'judgename' and, when the name ends in a run of the
      letters L, C and J preceded by whitespace, 'judgetitle'.
    * 'Date Created', 'Date Modified' and 'Date Issued', when present and
      non-empty, are converted from 'dd/mm/yyyy' text to datetime.datetime.
    * 'Date' is converted from text such as '7 Feb 2011' to
      datetime.datetime, using English three-letter month abbreviations.

    Returns None; the dict passed in is modified.

    Raises:
        ValueError: if one of the date fields is not in the expected format.
    """
    assert data.get('Judge') == data.get('Author'), data
    # 'Author' duplicates 'Judge' (checked above), so its value is discarded.
    data.pop('Author')
    judge = data.pop('Judge')

    mjudge = re.match(r'(.*?)\s+([LCJ]+)$', judge)
    if mjudge:
        data['judgename'] = mjudge.group(1)
        data['judgetitle'] = mjudge.group(2)
    else:
        data['judgename'] = judge

    for dkey in ['Date Created', 'Date Modified', 'Date Issued']:
        if data.get(dkey):
            mdate = re.match(r'(\d\d)/(\d\d)/(\d\d\d\d)$', data.get(dkey))
            if not mdate:
                # Fail with a readable message instead of an AttributeError
                # on None further down.
                raise ValueError('unrecognised %s value: %r' % (dkey, data.get(dkey)))
            data[dkey] = datetime.datetime(int(mdate.group(3)), int(mdate.group(2)), int(mdate.group(1)))

    date_text = data.get('Date')
    mdate = re.match(r'(\d+) (\w\w\w) (\d\d\d\d)', date_text or '')
    if not mdate:
        raise ValueError('unrecognised Date value: %r' % (date_text,))
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    # list.index raises ValueError for an unknown month abbreviation.
    imonth = months.index(mdate.group(2))
    data['Date'] = datetime.datetime(int(mdate.group(3)), imonth + 1, int(mdate.group(1)))
def parsepage(url):
    """Fetch one judgment page and return its metadata as a dict.

    The page is loaded with lxml (up to three attempts). Every row of the
    'table.MetaGrid_Display' table must have exactly two cells: the first
    cell's text, with leading/trailing colons and spaces stripped, becomes
    the key and the second cell's text the value. The absolute URL of the
    attached judgment document is stored under 'Judgment'.

    Raises:
        IOError: if the page could not be loaded in three attempts. The
            last error is re-raised rather than letting the function fail
            later on an unbound `root`.
    """
    root = None
    last_error = None
    for attempt in range(3):
        try:
            root = lxml.html.parse(url).getroot()
            break
        except IOError as err:
            last_error = err
    if root is None:
        if last_error is not None:
            raise last_error
        raise IOError('no document could be parsed from %s' % url)

    rows = root.cssselect('table.MetaGrid_Display tr')
    data = { }
    for row in rows:
        # Unpacking fails loudly if a row does not have exactly two children.
        td1, td2 = list(row)
        data[td1.text.strip(': ')] = td2.text
    a = root.cssselect('#MSCMSAttachmentCustom6_PresentationModeControlsContainer_presHyperLink')[0]
    # The attachment href may be relative, so resolve it against the page URL.
    data['Judgment'] = urlparse.urljoin(url, a.get('href'))
    return data
def parsecontentspage(response):
    """Parse one page of the decisions listing and save every entry.

    `response` is anything lxml.html.parse accepts. The first row of the
    listing table must hold the headers Date, Title, Identifier and Judge.
    Each remaining data row is merged with the details returned by parsepage
    for its link, normalised by cleanup and saved with 'Identifier' as the
    unique key.

    Returns a tuple (pager_markup, numdocs, rows_on_page): the serialised
    last row when it has a single cell (otherwise ''), the document count
    shown on the page, and the number of data rows handled.
    """
    doc = lxml.html.parse(response).getroot()
    rows = doc.cssselect('table#DynamicControlDataGrid1_MSCSDataListXML_Document_Dynamic1 tr')
    header_row = rows.pop(0)
    headers = [th[0].text for th in header_row]
    assert headers == ['Date', 'Title', 'Identifier', 'Judge'], headers
    count_span = doc.cssselect("span#DynamicControlDataGrid1_lblDocCount")[0]
    numdocs = int(count_span.text)

    # A last row with only one cell is not a data row; it is removed and
    # returned as markup (parsesession searches it for paging links).
    pager_markup = ''
    if len(list(rows[-1])) == 1:
        pager_markup = lxml.etree.tostring(rows.pop())

    for tr in rows:
        title_link = tr[1][0]
        cells = [tr[0].text, title_link.text, tr[2].text, tr[3].text]
        data = dict(zip(headers, cells))
        data['url'] = title_link.get('href')
        fdata = parsepage(data['url'])
        # The listing and the detail page must describe the same document.
        assert data.get('Identifier') == fdata.get('Identifier'), (data, fdata)
        assert data.get('Title') == fdata.get('Title'), (data, fdata)
        data.update(fdata)
        cleanup(data)
        scraperwiki.datastore.save(unique_keys=['Identifier'], data=data, date=data.get('Date Issued'))

    return pager_markup, numdocs, len(rows)
def parsesession(url):
# Walks every page of one decisions listing. The first page is opened with a
# mechanize Browser and parsed by parsecontentspage; further pages are reached
# by re-submitting the 'DynamicTable' form with the __doPostBack target and
# argument taken from the pager links found in `lrt`.
# NOTE(review): indentation was lost in this listing, so which of the trailing
# statements belong to the while loop must be confirmed against the original.
br = mechanize.Browser()
response = br.open(url)
lrt, numdocs, count = parsecontentspage(response)
ipage = 2
while lrt:
# Each match is (event target, event argument, link label); the label is a
# page number or '...'.
plinks = re.findall("\"javascript:__doPostBack\('(.*?)','(.*?)'\)\">(\d+|\.\.\.)</a>", lrt)
# Index the numbered links by page number.
lklook = dict([ (int(x[2]), x) for x in plinks if x[2] != '...' ])
nx = lklook.get(ipage)
if not nx:
# No numbered link for the wanted page: fall back to a trailing '...' link,
# which is only expected when ipage is 11, 21, 31, ...
if plinks and plinks[-1][2] == '...':
assert (ipage % 10) == 1, lklook
nx = plinks[-1]
else:
return
print "Page", ipage
br.select_form('DynamicTable')
# The hidden postback fields are read-only until this call.
br.form.set_all_readonly(False)
#print br.form.controls
br['__EVENTTARGET'] = nx[0]
br['__EVENTARGUMENT'] = nx[1]
#print br.form.controls[-1].name
# Disabled so the search button is not sent along with the form.
br.form.find_control('DynamicControlDataGrid1:SearchButton').disabled=True
response = br.submit()
lrt, lnumdocs, lcount = parsecontentspage(response)
ipage += 1
assert numdocs == lnumdocs
count += lcount
# NOTE(review): right after `count += lcount` this only holds when count was 0
# before; the intended check may have been against numdocs - confirm.
assert count == lcount
# Run the scraper.
Main()
|
id
courts-ni-judicial-decisions
|
name
Courts NI Judicial Decisions
|
status
|
author
|
lang
python
|
date_scraped
2011-02-07 17:20:28
|
code
import scraperwiki
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin
def scrape(base_url):
# Fetches the page at base_url, which must contain exactly one <table>
# (asserted), and builds a server_info record from its rows.
# NOTE(review): indentation was lost in this listing; the nesting of the
# statements inside the row loop should be confirmed against the original.
soup = BeautifulSoup(scraperwiki.scrape(base_url))
main_table = soup.findAll('table')
assert len(main_table) == 1
main_table = main_table[0]
for row in main_table.findAll('tr'):
# Rows whose class is exactly 'table_header' are skipped.
if row['class'] == 'table_header':
continue
server_info = {}
# True when the row's class string contains 'offline'.
server_info['offline'] = 'offline' in row['class']
server_td = row.find('td', 'server')
# The software name is read from the title of the cell's <img>.
server_info['software'] = server_td.find('img')['title']
server_div = server_td.find('div', 'tooltip_container')
server_info['name'] = server_div.find('a').string
# 'url' is only set when the <a> carries an href attribute.
if server_div.find('a').has_key('href'):
server_info['url'] = server_div.find('a')['href']
print server_info['name']
# The commented-out lines below build a `product` record and are not used here.
#product_td = product_row.findAll('td')
#product['type'] = ''.join(product_td[1].findAll(text=True))
#product_manufacturer = product_td[2].find('a')
#if product_td[2].find('a') is None:
# product_manufacturer = product_td[2]
#product['manufacturer'] = product_manufacturer.string
#product_line = product_td[3].find('a')
#product['url'] = urljoin(base_url, product_line['href'])
#product['model'] = product_line.string
#if 'title' in product_line:
# product['name'] = product_line['title']
# 'name' is the unique key of the saved record.
scraperwiki.datastore.save(['name'], server_info)
# Run the scraper against the jabberes.org server list.
scrape('http://www.jabberes.org/servers/servers.html')
|
id
jabberxmpp-server-list
|
name
Jabber servers
|
status
|
author
|
lang
python
|
date_scraped
2010-11-24 13:44:05
|
code
###############################################################################
# Basic scraper
###############################################################################
# Blank Python
import re
import scraperwiki
from BeautifulSoup import BeautifulSoup
#define the order our columns are displayed in the datastore
# NOTE(review): the list is stored under the metadata key 'users'; confirm that
# this is the key ScraperWiki reads for column ordering.
scraperwiki.metadata.save('users', ['Count','Name','Private','URL','Friends','Interests'])
#scrape the fan section
def scrape_personal(vcard):
    """Save the name and link found in a profile's vcard block.

    The record is keyed on the module-level `count`. 'Name' is the text of
    the vcard's <h1>, 'URL' the href of the link inside that heading, and
    'Private' is "Y" when the href matches the pattern '.register.',
    otherwise "N".
    """
    heading = vcard.h1
    profile_url = heading.a['href']
    is_private = re.search('.register.', profile_url) is not None
    record = {
        'Count': count,
        'Name': heading.text,
        'Private': "Y" if is_private else "N",
        'URL': profile_url,
    }
    scraperwiki.datastore.save(["Count"], record)
def scrape_interests(info):
# Builds 'Friends' and 'Interests' strings from the page's info column and
# saves them in a record keyed on the module-level `count`.
# NOTE(review): indentation was lost in this listing; in particular it cannot
# be told from here whether the `for psl in things` loop is nested inside
# `for like in likes` - confirm against the original scraper.
#setup the data record
record={}
record['Count']=count
friends_row = info.find("div",{"class":"UIGridRenderer_Row clearfix"})
if friends_row:
# Friend names are joined into one string, each followed by ':'.
s=""
friends=friends_row.findAll("div",{"class":"UIPortrait_Text"})
for friend in friends:
s+=friend.text+":"
record['Friends'] =s
public_listing=info.find("div",{"id":"public_listing_pages"})
if public_listing:
# <th> texts are wrapped in '|' characters; 'psl' link texts are each
# followed by ':'.
l=""
likes=public_listing.findAll("th")
things=public_listing.findAll("a",{"class":"psl"})
for like in likes:
l+="|"+like.text+"|"
for psl in things:
l+=psl.text+":"
record['Interests']= l
scraperwiki.datastore.save(["Count"], record)
def find_page(url):
# Fetches `url`. A page with a 'vcard' or 'info_column' div is treated as a
# profile and passed to scrape_personal / scrape_interests; otherwise the links
# in its 'UIDirectoryBox_List' lists are followed by calling find_page again.
# NOTE(review): indentation was lost in this listing; the nesting below
# (including which `if` the `else` belongs to) must be confirmed against the
# original scraper.
global count
check = False
try:
html = scraperwiki.scrape(url)
check=True
# NOTE(review): bare except - any error raised by the fetch is reported with
# the same message.
except:
print "Unable to retrieve page"
check=False
#continue
if check:
soup = BeautifulSoup(html)
vcard = soup.find("div",{"class":"vcard"})
info = soup.find("div",{"class":"info_column"})
# NOTE(review): the test passes when only one of the two divs exists, yet
# both helpers are called and each dereferences its argument - confirm that
# the two divs always appear together.
if vcard or info:
scrape_personal(vcard)
scrape_interests(info)
#print info
# `count` is the unique key used by both helpers.
count+=1
else:
directory=soup.find("div",{ "class" : "clearfix" }) #find the directory of links
UIDirectoryBox=directory.findAll("ul",{"class":"UIDirectoryBox_List"}) #find all the directoryboxes
for UI in UIDirectoryBox:
links = UI.findAll("a")
for link in links:
#print link['href']
urls=link['href']
find_page(urls)
def scrape_page(url):
    """Follow every link listed on the directory landing page.

    The page at `url` is fetched and parsed; for each element whose class
    matches the pattern "alphabet_list clearfix", the href of every <a>
    inside it is printed and handed to find_page.
    """
    landing = BeautifulSoup(scraperwiki.scrape(url))  # get the landing page
    class_pattern = re.compile("alphabet_list clearfix")
    for section in landing.findAll(attrs={'class': class_pattern}):
        for anchor in section.findAll("a"):
            target = anchor['href']
            print(target)
            find_page(target)
#setup the base url
base_url = 'http://www.facebook.com/directory/people/'
#set the counter
count=0
#call the scraping function
scrape_page(base_url)
#find_page('http://en-gb.facebook.com/people/Abdullah-Bin-A/867565175')
|
id
public-facebook-records
|
name
Public facebook records
|
status
|
author
|
lang
python
|
date_scraped
2011-02-07 17:20:37
|
code
#################################################################
# BBC Weather Scraper
#################################################################
import scraperwiki
from BeautifulSoup import BeautifulSoup
# Flat script: fetches the BBC forecast page for one location, then builds a
# record (day, summary, max/min temperature) from the page's <tr> rows.
# NOTE(review): indentation was lost in this listing; the nesting of the
# statements inside `for day in days:` must be confirmed against the original.
# URL for the region Hull, East Riding of Yorkshire
html = scraperwiki.scrape ('http://news.bbc.co.uk/weather/forecast/336?printco=Forecast')
print html
soup = BeautifulSoup(html)
days = soup.findAll('tr')
for day in days:
# Rows whose class does not contain 'day' are skipped.
# NOTE(review): day['class'] is read unconditionally - confirm every <tr> on
# the page has a class attribute.
if day['class'].find('day') == -1:
continue
record = {
'day': None,
'summary': None,
'temp_max_c': None,
'temp_min_c': None,
# 'windspeeddir': None,
# 'humpresvis': None,
}
tds = day.findAll('td')
# If the first cell holds several <abbr> elements, the last one wins.
for abbr in tds[0].findAll('abbr'):
record['day'] = abbr.text
for span in tds[2].findAll('span'):
# Any error raised for a span (for example a missing class attribute) is
# swallowed by the bare except and the span is ignored.
try:
if span['class'].find('temp max') != -1:
# [:-6] drops the last six characters of the text - presumably a trailing
# unit marker; confirm against the page markup.
record['temp_max_c'] = span.findAll('span',{'class':'cent'})[0].text[:-6]
except:
pass
for span in tds[3].findAll('span'):
try:
if span['class'].find('temp min') != -1:
record['temp_min_c'] = span.findAll('span',{'class':'cent'})[0].text[:-6]
except:
pass
# Wind speed & Direction
# for span in tds[4].findAll('span'):
# try:
# if span['class'].find('wind') != -1:
# record['windspeeddir'] = span.findAll('span',{'class':'mph'})[0].text[:-6]
# except:
# pass
# Humidity Pressure Visibility
# for span in tds[5].findAll('span'):
# try:
# if span['class'].find('humpresvis') != -1:
# record['humpresvis'] = span.findAll('span',{'class':'hum'})[0].text[:-6]
# except:
# pass
record['summary'] = day.findAll('div',{'class':'summary'})[0].findAll('strong')[0].text
# A bare print emits an empty line.
print
# 'day' is the unique key of the saved record.
scraperwiki.datastore.save(["day"], record)
|
id
bbc-weather-5-day-forecast-for-hull-uk
|
name
BBC Weather 5 Day Forecast for Hull, UK
|
status
|
author
|
lang
python
|
date_scraped
2011-02-07 17:20:40
|
code
import scraperwiki
from BeautifulSoup import BeautifulSoup
from urlparse import urljoin
def scrape():
    """Scrape the auto-hifi.ru price list and save one record per product.

    The page must contain exactly one <td class="contentpanel"> (asserted).
    Every <tr align="center"> except the first is read as a product row:
    cell 1 gives 'type', cell 2 'manufacturer' (taken from its link when it
    has one, otherwise from the cell itself), and the link in cell 3 gives
    'url' (made absolute), 'model' and, when the link has a title attribute,
    'name'. Records are saved with 'url' as the unique key.
    """
    base_url = 'http://auto-hifi.ru/price.php'
    html = scraperwiki.scrape(base_url)
    soup = BeautifulSoup(html)
    div_contentpanel = soup.findAll('td', 'contentpanel')
    assert len(div_contentpanel) == 1
    for product_row in soup.findAll('tr', {'align': 'center'})[1:]:
        product = {}
        product_td = product_row.findAll('td')
        product['type'] = ''.join(product_td[1].findAll(text=True))
        product_manufacturer = product_td[2].find('a')
        if product_manufacturer is None:
            product_manufacturer = product_td[2]
        product['manufacturer'] = product_manufacturer.string
        product_line = product_td[3].find('a')
        product['url'] = urljoin(base_url, product_line['href'])
        product['model'] = product_line.string
        # `'title' in product_line` would test the tag's child nodes, not its
        # attributes, so the name was never stored; look the attribute up
        # directly instead.
        title = product_line.get('title')
        if title is not None:
            product['name'] = title
        scraperwiki.datastore.save(['url'], product)


scrape()
|
id
auto-hifiru
|
name
auto-hifi.ru
|
status
|
author
|
lang
python
|
date_scraped
2011-02-07 17:20:41
|
code
##############################################################################
# Basic scraper
###############################################################################
import scraperwiki
from BeautifulSoup import BeautifulSoup
"""
soupselect.py
CSS selector support for BeautifulSoup.
soup = BeautifulSoup('<html>...')
select(soup, 'div')
- returns a list of div elements
select(soup, 'div#main ul a')
- returns a list of links inside a ul inside div#main
"""
import re
# Matches a selector token that is a plain tag name made only of lowercase
# letters and digits.
tag_re = re.compile('^[a-z0-9]+$')
# Matches one attribute-selector token such as tag[attr], [attr=value] or
# tag[attr^="value"]. Named groups: tag (optional), attribute, operator (one
# of = ~ | ^ $ * or empty) and value (may be empty).
attribselect_re = re.compile(
r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
r'=?"?(?P<value>[^\]"]*)"?\]$'
)
# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
#   \---/  \---/\-------------/    \-------/
#     |      |         |               |
#     |      |         |           The value
#     |      |  ~,|,^,$,* or =
#     |  Attribute
#    Tag
def attribute_checker(operator, attribute, value=''):
    """
    Takes an operator, attribute and optional value; returns a function that
    will return True for elements that match that combination.

    Elements only need a dict-style ``get(name, default)`` method.
    Supported operators:

        '='  attribute equals value
        '~'  value is one of the attribute's whitespace-separated tokens
        '^'  attribute starts with value
        '$'  attribute ends with value
        '*'  attribute contains value
        '|'  attribute equals value or starts with "value-"

    Any other operator (including the empty string produced by a bare
    ``[attr]`` selector) yields a checker that only tests whether the
    attribute is present.
    """
    def attr(el):
        # A missing attribute is treated as the empty string.
        return el.get(attribute, '')

    # Presence is tested through get() with a private sentinel instead of
    # has_key(), which plain mappings no longer provide on Python 3.
    missing = object()

    checkers = {
        '=': lambda el: el.get(attribute) == value,
        # attribute includes value as one of a set of space separated tokens
        '~': lambda el: value in attr(el).split(),
        # attribute starts with value
        '^': lambda el: attr(el).startswith(value),
        # attribute ends with value
        '$': lambda el: attr(el).endswith(value),
        # attribute contains value
        '*': lambda el: value in attr(el),
        # attribute is either exactly value or starts with value-
        '|': lambda el: attr(el) == value
            or attr(el).startswith('%s-' % value),
    }
    return checkers.get(
        operator,
        lambda el: el.get(attribute, missing) is not missing,
    )
def select(soup, selector):
    """
    soup should be a BeautifulSoup instance; selector is a CSS selector
    specifying the elements you want to retrieve.

    The selector is split on whitespace and each token narrows the current
    list of context elements to matching descendants. Supported token forms:
    attribute selectors (``tag[attr=value]`` and friends), ``tag#id``,
    ``tag.class``, ``*`` and plain tag names. A token of any other form
    makes the whole selection return an empty list.

    Returns a list of matching elements (possibly empty).
    """
    tokens = selector.split()
    current_context = [soup]
    for token in tokens:
        m = attribselect_re.match(token)
        if m:
            # Attribute selector
            tag, attribute, operator, value = m.groups()
            if not tag:
                tag = True
            checker = attribute_checker(operator, attribute, value)
            found = []
            for context in current_context:
                found.extend([el for el in context.findAll(tag) if checker(el)])
            current_context = found
            continue
        if '#' in token:
            # ID selector
            tag, element_id = token.split('#', 1)
            if not tag:
                tag = True
            # Search every context element, not only the first one: an
            # earlier token may have matched nothing (empty list) or may
            # have matched several elements with the id under a later one.
            el = None
            for context in current_context:
                el = context.find(tag, {'id': element_id})
                if el is not None:
                    break
            if el is None:
                return [] # No match
            current_context = [el]
            continue
        if '.' in token:
            # Class selector
            tag, klass = token.split('.', 1)
            if not tag:
                tag = True
            found = []
            for context in current_context:
                found.extend(
                    context.findAll(tag,
                        {'class': lambda attr: attr and klass in attr.split()}
                    )
                )
            current_context = found
            continue
        if token == '*':
            # Star selector
            found = []
            for context in current_context:
                found.extend(context.findAll(True))
            current_context = found
            continue
        # Here we should just have a regular tag
        if not tag_re.match(token):
            return []
        found = []
        for context in current_context:
            found.extend(context.findAll(token))
        current_context = found
    return current_context
def monkeypatch(BeautifulSoupClass=None):
    """
    Install select() on a soup class as the method ``findSelect``.

    When no class is given, the BeautifulSoup class from the BeautifulSoup
    module is imported and patched.
    """
    target = BeautifulSoupClass
    if not target:
        from BeautifulSoup import BeautifulSoup as target
    setattr(target, 'findSelect', select)
def unmonkeypatch(BeautifulSoupClass=None):
    """
    Remove the ``findSelect`` attribute from a soup class.

    When no class is given, the BeautifulSoup class from the BeautifulSoup
    module is imported and used. Raises AttributeError if the class has no
    ``findSelect`` attribute of its own.
    """
    target = BeautifulSoupClass
    if not target:
        from BeautifulSoup import BeautifulSoup as target
    del target.findSelect
# retrieve a page
# select() is first attached to the BeautifulSoup class as findSelect.
monkeypatch(BeautifulSoup)
starting_url = 'http://republicanwhip.house.gov/Newsroom/va7news.html'
html = scraperwiki.scrape(starting_url)
soup = BeautifulSoup(html)
# Captures the value of a double-quoted href attribute, case-insensitively.
re_href = re.compile(r'href="(.*?)"', re.I)
# Compiled once instead of on every strip_tags call.
_TAG_PATTERN = re.compile(r'<[^>]*>')


def strip_tags(x):
    """Return the string `x` with every ``<...>`` span removed.

    A "tag" is a '<', any run of characters other than '>', and the closing
    '>'. Text outside such spans, including whitespace, is left unchanged.
    """
    return _TAG_PATTERN.sub('', x)
# Flat script: for each 'tr.style2' row holding exactly two links, fetches the
# linked press release and saves its date, title and body.
# NOTE(review): indentation was lost in this listing; the nesting of the
# statements inside the loop must be confirmed against the original scraper.
# `statements` maps date -> {title: body markup as a string}.
statements = {}
# use BeautifulSoup to get all <tr> tags
trs = select(soup, 'tr.style2')
for tr in trs:
links = select(tr, 'td a')
# Only rows with exactly two links are handled: the first link supplies the
# date text and the href, the second the title text.
if len(links)==2:
date = strip_tags(str(links[0])).strip()
title = strip_tags(str(links[1])).strip()
href = re_href.search(str(links[0])).group(1).strip()
# NOTE(review): bare except - any error raised by the fetch makes the script
# skip this row silently.
try:
release = scraperwiki.scrape(href)
except:
continue
b2 = BeautifulSoup(release)
body = select(b2, 'div.asset-body div.style1')
if not statements.has_key(date):
statements[date] = {}
statements[date][title] = str(body)
# NOTE(review): 'text' is given `body`, the list returned by select(), while
# `statements` stores str(body) - confirm which form the datastore should get.
record = {'date': date, 'title': title, 'text': body}
scraperwiki.datastore.save(['date', 'title'], record)
|
id
rep-eric-cantors-press-releases
|
name
Rep. Eric Cantor's Press Releases
|
status
|
author
|
lang
python
|
date_scraped
2011-02-07 17:20:42
|
code
import scraperwiki
import re
from BeautifulSoup import BeautifulSoup
# The URLs we're going to scrape:
url = "http://www.taoiseach.gov.ie/eng/Taoiseach_and_Government/Government_Legislation_Programme/SECTION_A1.html"
html = scraperwiki.scrape(url)
# http://www.taoiseach.gov.ie/eng/eng/Taoiseach_and_Government/Government_Legislation_Programme/SECTION_A1.html SECTION A - Lists 23 Bills which the Government expect to publish from the start of the Dáil session up to the beginning of the next session
# http://www.taoiseach.gov.ie/eng/Taoiseach_and_Government/Government_Legislation_Programme/SECTION_B1.html SECTION B - Lists 13 Bills in respect of which Heads of Bills have been approved by Government and of which texts are being prepared
# http://www.taoiseach.gov.ie/eng/Taoiseach_and_Government/Government_Legislation_Programme/SECTION_C11.html SECTION C - Lists 55 Bills in respect of which heads have yet to be approved by Government
# http://www.taoiseach.gov.ie/eng/Taoiseach_and_Government/Government_Legislation_Programme/SECTION_D1.html SECTION D - Lists 25 Bills which are currently before the Dáil or Seanad
# http://www.taoiseach.gov.ie/eng/Taoiseach_and_Government/Government_Legislation_Programme/SECTION_E1.html SECTION E - Lists 113 Bills which were enacted since the Government came to office on 14th June, 2007
# http://www.taoiseach.gov.ie/eng/Taoiseach_and_Government/Government_Legislation_Programme/SECTION_F1.html SECTION F - Lists 121 Bills which were published since the Government came to office on 14th June, 2007
# This is a useful helper function that you might want to steal.
# It cleans up the data a bit.
# def gettext(html):
# """Return the text within html, removing any HTML tags i
# cleaned = re.sub('<.*?>', '', html) # remove tags
# cleaned = ' '.join(cleaned.split()) # collapse whitespace
# return cleaned
#text = scraperwiki.scrape(url)
# Flat script: parses the fetched page and saves one record per table row,
# taking number, bill and description from the row's first three child nodes.
# NOTE(review): indentation was lost in this listing; whether the `data = ...`
# and save statements sit inside the `else:` branch must be confirmed against
# the original scraper.
soup = BeautifulSoup(html)
#rows = re.findall('(?si)<tr[^>]*>(.*?)</tr>', text)
# <td valign="top">Forestry Bill</td><td valign="top">To reform and update the legislative framework relating to forestry in order to support the development of a modern forestry sector, which enshrines #the principles of sustainable forest management and protection of the environment<br /><br /></td></tr><tr>
#for row in rows:
trs = soup.findAll('tr')
for tr in trs:
# Rows containing an element with colspan="3" are skipped.
if tr.find(colspan="3"):
continue
# Rows whose second child starts with the text " Name Of Company " are
# skipped as well.
elif tr.contents[1].contents[0]==" Name Of Company ":
continue
else:
# Each value is a `.contents` list of child nodes, not a plain string.
number, bill, desc = tr.contents[0].contents, tr.contents[1].contents, tr.contents[2].contents
#dept = re.search('<td colspan="3"><strong>(.*?)</strong></td>', row)
#if dept:
# deptb = dept
# deptb, number, bill, desc = None, None, None, None
#print deptb, number, bill, desc
data = {'number': number, 'bill': bill, 'desc': desc } #'deptb':deptb,
# 'number' is the unique key of the saved record.
scraperwiki.datastore.save(['number'], data)
|
id
legislation-status
|
name
Irish Government: Legislation status
|
status
|
author
|
lang
ruby
|
date_scraped
2011-02-07 17:20:42
|
code
# Welcome to the second ScraperWiki Ruby tutorial
# At the end of the last tutorial we had downloaded the text of
# a webpage. We will do that again ready to process the contents
# of the page
html = ScraperWiki.scrape("http://www.stadtbranchenbuch.com/muenster/P/338.html")
puts html
# Next we use Nokogiri to extract the values from the HTML source.
# (The tutorial asked for the following lines to be uncommented; they
# already are.)
require 'nokogiri'
doc = Nokogiri::HTML(html)
# Then we can store this data in the datastore: one record is saved for every
# element matching '.rLiTop'.
doc.search('.rLiTop').each do |td|
# The markup inside the element's first <a> is split on "<br>" and each
# piece is stripped of surrounding whitespace.
data = td.css("a").first.inner_html.split("<br>").map(&:'strip')
#name= td.css("address").first.inner_html.split("</a>").map(&:'strip')
#td.css("a").inner_html
# NOTE(review): `name` is interpolated here although its assignment above is
# commented out - confirm whether that line should be restored or `name`
# dropped from the string.
something = "#{data},#{name}"
# The combined string is both the only column and the unique key.
ScraperWiki.save(['data'], {'data' => something})
end
# Check the 'Data' tab - here you'll see the data saved in the ScraperWiki store.
|
id
stadtbranchenbuch
|
name
stadtbranchenbuch
|
status
|
author
|
Total run time: less than 20 seconds
Total cpu time used: less than 5 seconds
Total disk space used: 66.2 MB