<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
	>

<channel>
	<title>ScraperWiki Data Blog</title>
	<atom:link href="http://blog.scraperwiki.com/feed/" rel="self" type="application/rss+xml" />
	<link>http://blog.scraperwiki.com</link>
	<description>A blog about ScraperWiki and all things data</description>
	<lastBuildDate>Sun, 27 May 2012 07:49:38 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.com/</generator>
<cloud domain='blog.scraperwiki.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
		<url>http://s2.wp.com/i/buttonw-com.png</url>
		<title>ScraperWiki Data Blog</title>
		<link>http://blog.scraperwiki.com</link>
	</image>
	<atom:link rel="search" type="application/opensearchdescription+xml" href="http://blog.scraperwiki.com/osd.xml" title="ScraperWiki Data Blog" />
	<atom:link rel='hub' href='http://blog.scraperwiki.com/?pushpress=hub'/>
		<item>
		<title>Microfinance Data Scraping</title>
		<link>http://blog.scraperwiki.com/2012/05/27/microfinance-data-scraping/</link>
		<comments>http://blog.scraperwiki.com/2012/05/27/microfinance-data-scraping/#comments</comments>
		<pubDate>Sun, 27 May 2012 07:49:35 +0000</pubDate>
		<dc:creator>Thomas Levine</dc:creator>
				<category><![CDATA[opendata]]></category>
		<category><![CDATA[research]]></category>
		<category><![CDATA[Scrapers]]></category>

		<guid isPermaLink="false">http://blog.scraperwiki.com/?p=758216948</guid>
		<description><![CDATA[I went to the Datakind&#8217;s New York Datadive last November and met the Microfinance Information Exchange (MIX), a group that &#8216;delivers data services, analysis, research and business information on the institutions that provide financial services to the world’s poor&#8217;. They wanted to see whether web-scraping could &#8230; <a href="http://blog.scraperwiki.com/2012/05/27/microfinance-data-scraping/">Continue reading <span class="meta-nav">&#8594;</span></a><img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216948&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>I went to the <a href="http://datakind.org/">Datakind</a>&#8217;s <a href="http://datakind.org/york-city-datadive/">New York Datadive</a> last November and met the <a href="http://themix.org">Microfinance Information Exchange</a> (MIX), a group that &#8216;delivers data services, analysis, research and business information on the institutions that provide financial services to the world’s poor&#8217;. They wanted to see whether web-scraping could save them from manually gathering data. So fellow divers and I showed MIX the utility of web-scraping. Over the course of a day, about six people scraped data about microfinance institutions from a bunch of websites, saving MIX an estimated year of manual data entry.</p>
<p>Over the past few months, I worked further with MIX to study who has access to what sorts of financial services. <a href="http://datakind.org">DataKind</a> just put up our <a href="http://datakind.org/2012/05/642/">blog post about the project</a>. Read the post, or just look at the <a href="http://southafrica.mixmarket.org/">map</a> and explore the data.</p>
<p><a href="http://southafrica.mixmarket.org"><img src="https://github.com/tlevine/thomaslevine.com/raw/master/blog/mix-data-scraping/map.png" alt="Screenshot of the interactive map displaying the scraped data" width="100%" /></a></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/scraperwiki.wordpress.com/758216948/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/scraperwiki.wordpress.com/758216948/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/scraperwiki.wordpress.com/758216948/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/scraperwiki.wordpress.com/758216948/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/scraperwiki.wordpress.com/758216948/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/scraperwiki.wordpress.com/758216948/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/scraperwiki.wordpress.com/758216948/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/scraperwiki.wordpress.com/758216948/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/scraperwiki.wordpress.com/758216948/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/scraperwiki.wordpress.com/758216948/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/scraperwiki.wordpress.com/758216948/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/scraperwiki.wordpress.com/758216948/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/scraperwiki.wordpress.com/758216948/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/scraperwiki.wordpress.com/758216948/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216948&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.scraperwiki.com/2012/05/27/microfinance-data-scraping/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/fb20594dd79dd9ab8d26c4bfd8f54fa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thomaslevine</media:title>
		</media:content>

		<media:content url="https://github.com/tlevine/thomaslevine.com/raw/master/blog/mix-data-scraping/map.png" medium="image">
			<media:title type="html">Screenshot of the interactive map displaying the scraped data</media:title>
		</media:content>
	</item>
		<item>
		<title>5 yr old goes &#8216;potty&#8217; at Devon and Somerset Service (Emergencies and Data Driven Stories)</title>
		<link>http://blog.scraperwiki.com/2012/05/25/5-yr-old-goes-potty/</link>
		<comments>http://blog.scraperwiki.com/2012/05/25/5-yr-old-goes-potty/#comments</comments>
		<pubDate>Fri, 25 May 2012 07:13:33 +0000</pubDate>
		<dc:creator>Julian Todd</dc:creator>
				<category><![CDATA[opendata]]></category>
		<category><![CDATA[Scrapers]]></category>
		<category><![CDATA[data]]></category>
		<category><![CDATA[html]]></category>
		<category><![CDATA[javascript]]></category>
		<category><![CDATA[open data]]></category>
		<category><![CDATA[scraper]]></category>
		<category><![CDATA[scrapers]]></category>
		<category><![CDATA[views]]></category>

		<guid isPermaLink="false">http://blog.scraperwiki.com/?p=758216901</guid>
		<description><![CDATA[It&#8217;s 9:54am in Torquay on a Wednesday morning: One appliance from Torquays fire station was mobilised to reports of a child with a potty seat stuck on its head. On arrival an undistressed two year old female was discovered with a toilet &#8230; <a href="http://blog.scraperwiki.com/2012/05/25/5-yr-old-goes-potty/">Continue reading <span class="meta-nav">&#8594;</span></a><img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216901&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<blockquote><p><strong>It&#8217;s <a href="http://www.dsfire.gov.uk/News/Newsdesk/IncidentDetail.cfm?IncidentID=13354&amp;siteCategoryId=3&amp;T1ID=26&amp;T2ID=41">9:54am</a> in Torquay on a Wednesday morning:</strong></p>
<p>One appliance from Torquays fire station was mobilised to reports of a child with a potty seat stuck on its head.</p>
<p>On arrival an undistressed two year old female was discovered with a toilet seat stuck on her head.</p>
<p>Crews used vaseline and the finger kit to remove the seat from the childs head to leave her uninjured.</p></blockquote>
<p>A couple of different interests directed me to scrape the <a href="http://www.dsfire.gov.uk/News/Newsdesk/IncidentsPast7days.cfm?siteCategoryId=3&amp;T1ID=26&amp;T2ID=35">latest incidents</a> of the <a href="http://www.dsfire.gov.uk/index.cfm?siteCategoryId=1">Devon and Somerset Fire and Rescue Service</a>. The scraper that has collected the data is <strong><a href="https://scraperwiki.com/scrapers/somerset_fire_incidents/">here</a></strong>.</p>
<p><strong>Why does this matter?</strong></p>
<p>Everybody loves their public safety workers &#8212; Police, Fire, and Ambulance. They save lives, give comfort, and are there when things get out of hand.</p>
<p>Where is the standardized performance data for these incident response workers? Real-time and rich data would revolutionize its governance and administration, would give real evidence of whether there are too many or too few police, fire or ambulance personnel/vehicles/stations in any locale, or would enable the implementation of imaginative and realistic policies resulting from major efficiency and resilience improvements all through the system.</p>
<p>For those of you who want to skip all the background discussion, just head directly over to the <strong><a href="https://views.scraperwiki.com/run/somerset_fire_day_rose/">visualization</a></strong>.</p>
<p><a href="https://views.scraperwiki.com/run/somerset_fire_day_rose/"><img class="aligncenter size-full wp-image-758216924" style="border:1px solid #ccc;" title="Devon and Somerset Fire Service visualisation" src="http://scraperwiki.files.wordpress.com/2012/05/fire.png?w=640&h=298" alt="A rose diagram showing incidents handled by the Devon and Somerset Fire Service" width="640" height="298" /></a></p>
<p>The easiest method to monitor the needs of the organizations is to see how much work each employee is doing, and add more or take away staff depending on their workloads. The problem is, for an <em>emergency</em> service that exists on standby for unforeseen events, there needs to be a level of idle capacity in the system. Also, there will be a degree of unproductive make-work in any organization &#8212; Indeed, a lot of form filling currently happens around the place, despite there being no accessible data at the end of it.</p>
<p>The second easiest method of oversight is to compare one area with another. I have an example from <a href="http://www.californiacityfinance.com/#SPENDING">California City Finance</a> where the Excel spreadsheet of <a href="http://www.californiacityfinance.com/Fire09p.xlsx">Fire Spending By city</a> even has a breakdown of the spending per capita and as a percentage of the total city budget. The city to look at is <a href="http://en.wikipedia.org/wiki/Vallejo,_California">Vallejo</a> which entered bankruptcy in 2008. Many of its citizens blamed this on the exorbitant salaries and benefits of its firefighters and police officers. I can&#8217;t quite see it in this data, and the <a href="http://www.vanityfair.com/business/features/2011/11/michael-lewis-201111">story journalism</a> on it doesn&#8217;t provide an unequivocal picture.</p>
<p>The best method for determining the efficient and robust provision of such services is to have an accurate and comprehensive computer model on which to run simulations of the business and experiment with different strategies. This is what Tesco or Walmart or any large corporation would do in order to drive up its efficiency and monitor and deal with threats to its business. There is bound to be a dashboard in Tesco HQ monitoring the distribution of full fat milk across the country, and they would know to three decimal places what percentage of the product was being poured down the drain because it got past its sell-by date, and, conversely, whenever too little of the substance had been delivered such that stocks ran out. They would use the data to work out what circumstances caused changes in demand. For example, school holidays.</p>
<p>I have surveyed many of the documents within the <a href="http://www.dsfire.gov.uk/FireAuthority/MeetingsArchive.cfm?siteCategoryId=10&amp;T1ID=83">Devon &amp; Somerset Fire &amp; Rescue Authority</a> website, and have come up with no evidence of such data or its analysis anywhere within the organization. This is quite a surprise, and perhaps I haven&#8217;t looked hard enough, because the documents are extremely boring and strikingly irrelevant.</p>
<h2>Under the hood – how it all works</h2>
<p><a href="https://scraperwiki.com/scrapers/somerset_fire_incidents/" target="_blank">The scraper itself</a> has gone through several iterations. It currently operates through three functions: MainIndex(), MainDetails(), MainParse(). Data for each incident is put into several tables joined by the <strong>IncidentID</strong> value derived from the incident&#8217;s static url, eg:</p>
<blockquote>
<pre><a href="http://www.dsfire.gov.uk/News/Newsdesk/IncidentDetail.cfm?IncidentID=7901&amp;siteCategoryId=3&amp;T1ID=26&amp;T2ID=41">http://www.dsfire.gov.uk/News/Newsdesk/IncidentDetail.cfm?IncidentID=7901&amp;siteCategoryId=3&amp;T1ID=26&amp;T2ID=41</a></pre>
</blockquote>
<p><strong>MainIndex()</strong> operates their <a href="http://www.dsfire.gov.uk/News/Newsdesk/SearchIncidents.cfm?siteCategoryId=3&amp;T1ID=26&amp;T2ID=41">search incidents form</a> grabbing 10 days at a time and saving URLs for each individual incident page into the table <em>swdata</em>.</p>
<p><strong>MainDetails()</strong> downloads each of those incident pages, parsing the obvious metadata, and saving the remaining HTML content of the description into the database. (This used to attempt to parse the text, but I then had to move it into the third function so I could develop it more easily.) A good way to find the list of urls that have not been downloaded and saved into the <em>swdetails</em> is to use the following SQL statement:</p>
<blockquote>
<pre><strong>select</strong> swdata.IncidentID, swdata.urlpage 
<strong>from</strong> swdata 
<strong>left join</strong> swdetails <strong>on</strong> swdetails.IncidentID=swdata.IncidentID 
<strong>where</strong> swdetails.IncidentID is null 
<strong>limit</strong> 5</pre>
</blockquote>
<p>We then download the HTML from each of the five urlpages, save it into the table under the column <em>divdetails</em> and repeat until no more unmatched records are retrieved.</p>
<p><strong>MainParse()</strong> performs the same progressive operation on the HTML contents of <em>divdetails</em>, saving it into the table <em>swparse</em>. Because I was developing this function experimentally to see how much information I could obtain from the free-form text, I had to frequently drop and recreate enough of the table for the join command to work:</p>
<blockquote>
<pre>scraperwiki.sqlite.execute("drop table if exists swparse")
scraperwiki.sqlite.execute("create table if not exists swparse (IncidentID text)")</pre>
</blockquote>
<p>After marking the text down (by replacing the <em>&lt;p&gt;</em> tags with linefeeds), we have text that reads <a href="http://www.dsfire.gov.uk/News/Newsdesk/IncidentDetail.cfm?IncidentID=10421&amp;siteCategoryId=3&amp;T1ID=26&amp;T2ID=41">like this</a> (emphasis added):</p>
<blockquote><p><strong>One appliance from Holsworthy</strong> was mobilised to reports of a motorbike on fire. <strong>Crew Commander Squirrell</strong> was in charge.</p>
<p>On arrival one motorbike was discovered well alight. One hose reel was used to extinguish the fire. The police were also in attendance at this incident.</p></blockquote>
<p>We can get who is in charge and what their rank is using this regular expression:</p>
<blockquote>
<pre>re.findall("(crew|watch|station|group|incident|area)\s+(commander|manager)\s*([\w\-]+)(?i)", details)</pre>
</blockquote>
<p>You can see the whole table <a href="https://api.scraperwiki.com/api/1.0/datastore/sqlite?format=htmltable&amp;name=somerset_fire_incidents&amp;query=select%20count(*)%20as%20number%2C%20inchargetype1%2C%20inchargetype2%2C%20inchargename%20from%20swparse%20where%20inchargename%20is%20not%20null%20group%20by%20inchargename%0A">here</a> including silly names, misspellings, and clear flaws within my regular expression such as not being able to handle the case of a first name and a last name being included. (The personnel misspellings suggest that <em>either</em> these incident reports are not integrated with their actual incident logs where you would expect persons to be identified with their codenumbers, <em>or</em> their record keeping is terrible.)</p>
<p>For detecting how many vehicles were in attendance, I used this algorithm:</p>
<blockquote>
<pre>appliances = re.findall("(\S+) (?:(fire|rescue) )?(appliances?|engines?|tenders?|vehicles?)(?: from ([A-Za-z]+))?(?i)", details)
nvehicles = 0
for scount, fire, engine, town in appliances:
    if town and "town" not in data:
        data["town"] = town.lower(); 
    if re.match("one|1|an?|another(?i)", scount):  count = 1
    elif re.match("two|2(?i)", scount):            count = 2
    elif re.match("three(?i)", scount):            count = 3
    elif re.match("four(?i)", scount):             count = 4
    else:                                          count = 0
    nvehicles += count</pre>
</blockquote>
<h2>And now onto <a href="https://views.scraperwiki.com/run/somerset_fire_day_rose/">the visualization</a>…</h2>
<p>It&#8217;s not good enough to have the data. You need to do something with it. See it and explore it.</p>
<p>For some reason I decided that I wanted to graph the hour of the day each incident took place, and produced <a href="https://views.scraperwiki.com/run/somerset_fire_day_rose/">this time rose</a>, which is a polar bar graph with one sector showing the number of incidents occurring each hour.</p>
<p>You can filter by the day of the week, the number of vehicles involved, the category, year, and fire station town. Then click on one of the sectors to see all the incidents for that hour, and click on an incident to read its description.</p>
<p>Now, if we matched our stations against the <a href="http://www.dsfire.gov.uk/YourArea/SomersetCommand/Stations.cfm?siteCategoryId=12&amp;T1ID=59&amp;T2ID=69">list of all stations</a>, and geolocated the incident locations using the <strong>Google Maps API</strong> (subject to not going <em>OVER_QUERY_LIMIT</em>), then we would be able to plot a map of how far the appliances were driving to respond to each incident. Even better, I could post the start and end locations into the Google Directions API, and get journey times and an idea of which roads and junctions are the most critical.</p>
<p>There&#8217;s more. What if we could identify when the response did not come from the closest station, because it was over capacity? What if we could test whether closing down or expanding one of the other stations would improve the performance in response to the database of times, places and severities of each incident? What if each journey time was logged to find where the road traffic bottlenecks are? How about cross-referencing the fire service logs for each incident with the equivalent logs held by the police and ambulance services, to identify the Total Response Cover for the whole incident – information that&#8217;s otherwise balkanized and duplicated among the three different historically independent services.</p>
<p>Sometimes it&#8217;s also enlightening to see what <em>doesn&#8217;t</em> appear in your datasets. In this case, one incident I was specifically looking for strangely doesn&#8217;t appear in these Devon and Somerset Fire logs: <a href="http://www.mendipcaverescue.org/index.php?option=com_content&amp;task=view&amp;id=50&amp;Itemid=48">On 17 March 2011</a> the Police, Fire and Ambulance were all mobilized in massive numbers towards Goatchurch Cavern – but the Mendip Cave Rescue service only heard about it via the <a href="http://www.ascrt.com/">Avon and Somerset Cliff Rescue</a>. Surprise surprise, the event&#8217;s missing from my Fire logs database. No one knows anything of what is going on. And while we&#8217;re at it, why are they separate organizations anyway?</p>
<p>Next up, someone else can do the Cornwall Fire and Rescue Service and see if they can get their <a href="https://db.cornwall.gov.uk/fbnews/info/search.asp">incident search form</a> to work.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/scraperwiki.wordpress.com/758216901/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/scraperwiki.wordpress.com/758216901/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/scraperwiki.wordpress.com/758216901/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/scraperwiki.wordpress.com/758216901/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/scraperwiki.wordpress.com/758216901/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/scraperwiki.wordpress.com/758216901/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/scraperwiki.wordpress.com/758216901/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/scraperwiki.wordpress.com/758216901/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/scraperwiki.wordpress.com/758216901/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/scraperwiki.wordpress.com/758216901/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/scraperwiki.wordpress.com/758216901/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/scraperwiki.wordpress.com/758216901/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/scraperwiki.wordpress.com/758216901/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/scraperwiki.wordpress.com/758216901/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216901&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.scraperwiki.com/2012/05/25/5-yr-old-goes-potty/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ae3cb03a98a6470bdf839dd84a226e47?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">goatchurch</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/05/fire.png" medium="image">
			<media:title type="html">Devon and Somerset Fire Service visualisation</media:title>
		</media:content>
	</item>
		<item>
		<title>Handling exceptions in scrapers</title>
		<link>http://blog.scraperwiki.com/2012/05/15/handling-exceptions-in-scrapers/</link>
		<comments>http://blog.scraperwiki.com/2012/05/15/handling-exceptions-in-scrapers/#comments</comments>
		<pubDate>Tue, 15 May 2012 18:48:41 +0000</pubDate>
		<dc:creator>Thomas Levine</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://blog.scraperwiki.com/?p=758216842</guid>
		<description><![CDATA[When requesting and parsing data from a source with unknown properties and random behavior (in other words, scraping), I expect all kinds of bizarrities to occur. Managing exceptions is particularly helpful in such cases. Here are some ways that an &#8230; <a href="http://blog.scraperwiki.com/2012/05/15/handling-exceptions-in-scrapers/">Continue reading <span class="meta-nav">&#8594;</span></a><img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216842&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>When requesting and parsing data from a source with unknown properties and random behavior (in other words, scraping), I expect all kinds of bizarrities to occur. Managing exceptions is particularly helpful in such cases.</p>
<p>Here are some ways that an exception might be raised.</p>
<pre><code>[][0] #The list has no zeroth element, so this raises an IndexError
{}['foo'] #The dictionary has no foo element, so this raises a KeyError
</code></pre>
<p>Catching the exception is sometimes cleaner than preventing it from happening in the first place. Here are some examples handling bizarre exceptions in scrapers.</p>
<h2>Example 1: Inconsistent date formats</h2>
<p>Let&#8217;s say we&#8217;re parsing dates.</p>
<pre><code>import datetime
</code></pre>
<p>This doesn&#8217;t raise an error.</p>
<pre><code>datetime.datetime.strptime('2012-04-19', '%Y-%m-%d')
</code></pre>
<p>But this does.</p>
<pre><code>datetime.datetime.strptime('April 19, 2012', '%Y-%m-%d')
</code></pre>
<p>It raises a <em>ValueError</em> because the date formats don&#8217;t match. So what do we do if we&#8217;re scraping a data source with multiple date formats?</p>
<h3>Ignoring unexpected date formats</h3>
<p>A simple thing is to ignore the date formats that we didn&#8217;t expect.</p>
<pre><code>import lxml.html
import datetime

def parse_date1(source):
    rawdate = lxml.html.fromstring(source).get_element_by_id('date').text

    try:
         cleandate = datetime.datetime.strptime(rawdate, '%Y-%m-%d')
    except ValueError:
         cleandate = None

    return cleandate

print parse_date1('&lt;div id="date"&gt;2012-04-19&lt;/div&gt;')
</code></pre>
<p>If we make a clean date column in a database and put this in there, we&#8217;ll have some rows with dates and some rows with nulls. If there are only a few nulls, we might just parse those by hand.</p>
<h3>Trying multiple date formats</h3>
<p>Maybe we have determined that this particular data source uses three different date formats. We can try all three.</p>
<pre><code>import lxml.html
import datetime

def parse_date2(source):
    rawdate = lxml.html.fromstring(source).get_element_by_id('date').text

    for date_format in ['%Y-%m-%d', '%B %d, %Y', '%d %B, %Y']:
        try:
             cleandate = datetime.datetime.strptime(rawdate, date_format)
             return cleandate
        except ValueError:
             pass

    return None

print parse_date2('&lt;div id="date"&gt;19 April, 2012&lt;/div&gt;')
</code></pre>
<p>This loops through three different date formats and returns the first one that doesn&#8217;t raise the error.</p>
<h2>Example 2: Unreliable HTTP connection</h2>
<p>If you&#8217;re scraping an unreliable website or you are behind an unreliable internet connection, you may sometimes get <a href="http://docs.python.org/library/urllib2.html#urllib2.HTTPError">HTTPError</a>s or URLErrors for valid URLs. Trying again later might help.</p>
<pre><code>import time
import urllib2

def load(url):
    retries = 3
    for i in range(retries):
        try:
            handle = urllib2.urlopen(url)
            return handle.read()
        except urllib2.URLError:
            if i + 1 == retries:
                raise
            else:
                time.sleep(42)
    # never get here

print load('http://thomaslevine.com')
</code></pre>
<p>This function tries to download the page three times. On the first two failures, it waits 42 seconds and tries again. On the third failure, it raises the error. On a success, it returns the content of the page.</p>
<h2>Example 3: Logging errors rather than raising them</h2>
<p>For more complicated parses, you might find loads of errors popping up in weird places, so you might want to go through all of the documents before deciding which to fix first or whether to do some of them manually.</p>
<pre><code>import scraperwiki

for document_name in document_names:
    try:
        parse_document(document_name)
    except Exception as e:
        scraperwiki.sqlite.save([], {
            'documentName': document_name,
            'exceptionType': str(type(e)),
            'exceptionMessage': str(e)
        }, 'errors')
</code></pre>
<p>This catches any exception raised by a particular document, stores it in the database and then continues with the next document. Looking at the database afterwards, you might notice some trends in the errors that you can easily fix and some others where you might hard-code the correct parse.</p>
<h2>Example 4: Exiting gracefully</h2>
<p>When I&#8217;m scraping over 9000 pages and my script fails on page 8765, I like to be able to resume where I left off. I can often figure out where I left off based on the previous row that I saved to a database or file, but sometimes I can&#8217;t, particularly when I don&#8217;t have a unique index.</p>
<pre><code>for bar in bars:
    try:
        foo(bar)
    except:
        print('Failure at bar = "%s"' % bar)
        raise
</code></pre>
<p>This will tell me which <em>bar</em> I left off on. It&#8217;s fancier if I save the information to the database, so here is how I might do that with ScraperWiki.</p>
<pre><code>import scraperwiki

resume_index = scraperwiki.sqlite.get_var('resume_index', 0)
for i, bar in enumerate(bars[resume_index:]):
    try:
        foo(bar)
    except:
        scraperwiki.sqlite.save_var('resume_index', i)
        raise
scraperwiki.sqlite.save_var('resume_index', 0)
</code></pre>
<p>ScraperWiki has a limit on CPU time, so an error that often concerns me is the <a href="https://scraperwiki.com/docs/python/python_help_documentation/">scraperwiki.CPUTimeExceededError</a>. This error is raised after the script has used 80 seconds of CPU time; if you catch the exception, you have two CPU seconds to clean up. You might want to handle this error differently from other errors.</p>
<pre><code>import scraperwiki

resume_index = scraperwiki.sqlite.get_var('resume_index', 0)
for i, bar in enumerate(bars[resume_index:]):
    try:
        foo(bar)
    except scraperwiki.CPUTimeExceededError:
        scraperwiki.sqlite.save_var('resume_index', i)
    except Exception as e:
        scraperwiki.sqlite.save_var('resume_index', i)
        scraperwiki.sqlite.save([], {
            'bar': bar,
            'exceptionType': str(type(e)),
            'exceptionMessage': str(e)
        }, 'errors')
scraperwiki.sqlite.save_var('resume_index', 0)
</code></pre>
<h2>tl;dr</h2>
<p>Expect exceptions to occur when you are scraping a randomly unreliable website with randomly inconsistent content, and consider handling them in ways that allow the script to keep running when one document of interest is bizarrely formatted or not available.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/scraperwiki.wordpress.com/758216842/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/scraperwiki.wordpress.com/758216842/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/scraperwiki.wordpress.com/758216842/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/scraperwiki.wordpress.com/758216842/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/scraperwiki.wordpress.com/758216842/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/scraperwiki.wordpress.com/758216842/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/scraperwiki.wordpress.com/758216842/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/scraperwiki.wordpress.com/758216842/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/scraperwiki.wordpress.com/758216842/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/scraperwiki.wordpress.com/758216842/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/scraperwiki.wordpress.com/758216842/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/scraperwiki.wordpress.com/758216842/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/scraperwiki.wordpress.com/758216842/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/scraperwiki.wordpress.com/758216842/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216842&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.scraperwiki.com/2012/05/15/handling-exceptions-in-scrapers/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/fb20594dd79dd9ab8d26c4bfd8f54fa7?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">thomaslevine</media:title>
		</media:content>
	</item>
		<item>
		<title>Announcing ScraperWiki Premium Accounts!</title>
		<link>http://blog.scraperwiki.com/2012/04/20/premium-accounts/</link>
		<comments>http://blog.scraperwiki.com/2012/04/20/premium-accounts/#comments</comments>
		<pubDate>Fri, 20 Apr 2012 15:56:10 +0000</pubDate>
		<dc:creator>Francis Irving</dc:creator>
				<category><![CDATA[business]]></category>
		<category><![CDATA[developer]]></category>

		<guid isPermaLink="false">http://blog.scraperwiki.com/?p=758216875</guid>
		<description><![CDATA[The most exciting bit about ScraperWiki is how it forms a link between two very different worlds. On the one hand, we love the public good that data liberation enables, and we&#8217;re used by everyone from journalists (did you see &#8230; <a href="http://blog.scraperwiki.com/2012/04/20/premium-accounts/">Continue reading <span class="meta-nav">&#8594;</span></a><img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216875&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="http://scraperwiki.files.wordpress.com/2012/04/open_for_business.jpg"><img class="alignright size-medium wp-image-758216885" title="Open For Business!" src="http://scraperwiki.files.wordpress.com/2012/04/open_for_business.jpg?w=224&h=300" alt="ScraperWiki digger in front of credit card payment logos" width="224" height="300" /></a>The most exciting bit about ScraperWiki is how it forms a link between two very different worlds.</p>
<p>On the one hand, we love the public good that data liberation enables, and we&#8217;re used by everyone from journalists (did you see us on the <a href="http://www.guardian.co.uk/politics/2012/apr/10/mps-lords-perks-revealed-parliament">Guardian front page</a> last week?) to activists (like the guys behind <a href="http://blog.scraperwiki.com/2011/03/21/scraperwiki-ing-down-under/">Australian planning alerts</a>).</p>
<p>But we also love the value that businesses create using data. They use ScraperWiki in many ways – like pulling customised marketing leads from the web, and extracting and cleaning old proprietary data so it can be sold anew – something we&#8217;ll be blogging about a lot more in the next few weeks.</p>
<p>Today, we&#8217;re really excited to announce that <em>anyone</em> (be they journalists, businesses or anything else!) can now use ScraperWiki in private with the click of a button. Our new <a href="https://scraperwiki.com/pricing/">premium accounts</a> range from $9 per month for individuals, to $299 for corporates with lots of collaborators – all you need is a credit card.</p>
<p>For that monthly fee you get to make ScraperWiki vaults (secure, private areas, which you can share with precisely who you want) and you also get the ability to schedule any scraper to run hourly (for data feeds that update more often than once a day).</p>
<p>This will let journalists keep their scrapers secret – embargoed until they write their story. It will let businesses scrape websites without revealing to their competitors the advantage they&#8217;ve found. It will let anyone scrape their own private data, <em>in private</em>, to repurpose it and do wonderful things that nobody had ever intended.</p>
<p>We&#8217;re quite excited to hear about what you do. Since vaults are private we won&#8217;t know, so please <a href="https://scraperwiki.com/contact/">get in touch</a>. We&#8217;d love to write about it here, if you&#8217;ll let us.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/scraperwiki.wordpress.com/758216875/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/scraperwiki.wordpress.com/758216875/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/scraperwiki.wordpress.com/758216875/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/scraperwiki.wordpress.com/758216875/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/scraperwiki.wordpress.com/758216875/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/scraperwiki.wordpress.com/758216875/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/scraperwiki.wordpress.com/758216875/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/scraperwiki.wordpress.com/758216875/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/scraperwiki.wordpress.com/758216875/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/scraperwiki.wordpress.com/758216875/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/scraperwiki.wordpress.com/758216875/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/scraperwiki.wordpress.com/758216875/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/scraperwiki.wordpress.com/758216875/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/scraperwiki.wordpress.com/758216875/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216875&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.scraperwiki.com/2012/04/20/premium-accounts/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/385f073a12b016d1a85c0fda88ce82d5?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">frabcus</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/04/open_for_business.jpg?w=224" medium="image">
			<media:title type="html">Open For Business!</media:title>
		</media:content>
	</item>
		<item>
		<title>Parsing panic</title>
		<link>http://blog.scraperwiki.com/2012/04/17/parsing-panic/</link>
		<comments>http://blog.scraperwiki.com/2012/04/17/parsing-panic/#comments</comments>
		<pubDate>Tue, 17 Apr 2012 12:20:26 +0000</pubDate>
		<dc:creator>Francis Irving</dc:creator>
				<category><![CDATA[journalism]]></category>

		<guid isPermaLink="false">http://blog.scraperwiki.com/?p=758216635</guid>
		<description><![CDATA[This is a guest post by Martha Rotter, co-founder of Woop.ie and recently launched Irish technology magazine Idea. Hey remember the Wikipedia blackout? I do, because I was highly amused by the number of students panicking due to papers or homework &#8230; <a href="http://blog.scraperwiki.com/2012/04/17/parsing-panic/">Continue reading <span class="meta-nav">&#8594;</span></a><img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216635&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><em><img class="alignright" src="https://secure.gravatar.com/avatar.php?size=125&amp;d=identicon&amp;gravatar_id=9c94c3a7dae3a482ff5f7db2c3619430" alt="" width="125" height="125" />This is a guest post by Martha Rotter, co-founder of Woop.ie and recently launched Irish technology magazine Idea.<br />
</em></p>
<p>Hey remember the Wikipedia blackout? I do, because I was highly amused by the number of students panicking due to papers or homework they seemingly could not complete without this one website.</p>
<p>One of my favourite things to do with ScraperWiki is to capture people’s reactions and sentiments, and then try to make predictions based on the data. I call it a “Zeitgeist Parse”, because I’m looking for the general public’s response to some event currently happening. Looking at the barrage of tweets coming from confused and frustrated students, I wondered could we predict an upcoming epidemic of bad grades or test results.</p>
<p><strong>PROCESS</strong></p>
<p>I built a few quick scrapers to grab tweets related to Wikipedia blackouts. The queries I used were “wikipedia AND paper” and “wikipedia AND homework”. I thought there might be slight variations in what people with homework were worried about versus maybe more detailed term papers or reports. You can see the Python code for them on <a href="https://scraperwiki.com/profiles/martharotter/">my Scraperwiki profile</a>.</p>
<p>After the results were stored, I wanted to do something very simple. I wanted to parse all of the records and get the words tweeted most frequently. From there, I could start to analyze the data more clearly and find patterns and trends.</p>
<p>One way to do this is to take the data &amp; use something like IBM’s <a href="http://www-958.ibm.com/software/data/cognos/manyeyes/">ManyEyes</a> to get a visualization of frequently used text. This is handy if you want a Tag Cloud or basic chart to view the results.</p>
<p><a href="http://scraperwiki.files.wordpress.com/2012/04/scraperwikiimg4.png"><img class="alignnone size-full wp-image-758216837" title="Twitter tag cloud" src="http://scraperwiki.files.wordpress.com/2012/04/scraperwikiimg4.png?w=640" alt=""   /></a></p>
<p>However I was conscious of the fact that with so many tweets, it could be easy to miss smaller but still significant trends. A really easy way to parse and sort text is by using Excel + VBA. Since ScraperWiki can export to CSV, I downloaded the CSV files &amp; wrote a small macro to walk through the words and count instances of each of them. After sorting the results, I had a fairly solid picture of the top words used by protesting tweeters.</p>
<p><strong>WHAT I DIDN’T FIND</strong></p>
<p>I actually did not find specific subjects. Hardly any comments about which course or paper was in danger due to the shutdown. Few worries about particular subjects, the notable exceptions being history, with 50 instances and English with 37 instances appearing in the data. For a moment, my experiment was basically a waste of time and processing power.</p>
<p><strong>WHAT I DID FIND</strong></p>
<p>But as I examined the results, what I actually found was slightly more interesting. After removing obvious words like Wikipedia and homework, I started to see a few recurring patterns in terms of type of language used.</p>
<p>The panic of the situation jumps out immediately. Words like GOTTA (I didn’t remove capitalization as it adds context in this scenario), fail, DOWN, NEED, TOMORROW, extension, justmyluck, screwed, and even HLP!!!!!!! appear in high numbers throughout the results.</p>
<p>Next I noticed the very emotional nature of the language. As expected, lots of swearing and foul language appears. But also high instances of things like hate, mad, suck, freaking, omfg, fixitnow, and of course WTF showed up in the data. As someone who in college definitely did my share of writing papers the night before they were due, I understand the terror and panic. On the other hand, I was usually surrounded by library books I had checked out (probably that day) with no fear that they might suddenly go blank.</p>
<p>The last pattern that I noticed was one of interesting hashtags. These included expected ones like #blackout, #badtiming, #PIPA, #stopSOPA, #wikipediablackout, and #sopa. But also some really bizarre ones that I have no idea how they related to the situation, and may simply remain a mystery: #fratproblems, #thekidsareourfuture, #BingGrlProblems, #SHOUTOUT, and #cooooooooooooooooooooooooooooooool. But my favourite one was probably #GoToALibrary!</p>
<p><strong>SO YOU WANNA CREATE A ZEITGEIST PARSE?</strong></p>
<p>Start by identifying your query parameters. Are you searching by words, by geography, by date? Remember that Twitter’s Search API only goes back a few days, so if you’re looking for anything older than a week this API won’t work. Twitter’s API documentation is great but does change every so often so keep an eye on <a href="https://dev.twitter.com//">Twitter Developers</a> for the most up-to-date information about their API and what you can use as parameters for the Search API.</p>
<p>Once you have defined your query, the next step is to create your ScraperWiki scraper with the information. Feel free to copy the source from one of my scrapers <a href="https://scraperwiki.com/scrapers/wikipedia_homework_scraper/">like this one</a> and update with your own parameters.</p>
<p>Next you’ll need to set up the scraper to run one or more times. How often do you want it to run to get useful results? You can run it once &amp; download the data as a JSON or CSV file, or as a SQLite database. Or you can schedule it to run at regular intervals and download the info yourself each time.</p>
<p>After you have the data you need, all you have left to do is analyse. I mentioned ManyEyes earlier, which you can use to get some nice visualizations quite easily or you can use Excel or <a href="http://code.google.com/p/google-refine/">Google Refine</a> to parse and examine the data. If you’re comfortable with JavaScript, something like HighCharts can help to create nice, interactive visualizations easily from your data.</p>
<p>And now you have a good overview of what people think about the given topic in either a dataset or visualization. Hopefully you made some predictions about what you would find so you can validate your predictions or, as in my case here, observe something completely different.</p>
<p><strong>SUMMARY</strong></p>
<p>Writing a quick Python method using ScraperWiki to query Twitter’s search API is fairly straightforward. Finishing a term paper without using Wikipedia on the other hand? Not so straightforward for some unfortunate students!</p>
<p><em>You can read Martha&#8217;s articles about the Irish presidential election at <a href="http://martharotter.com/blog/index.php/2011/10/visualizing-aras-election/">Visualizing Aras Election</a> and <a href="http://martharotter.com/blog/index.php/2011/10/visualizing-aras-election-part-two/">Visualizing Aras Election, Part Two</a>.</em></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/scraperwiki.wordpress.com/758216635/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/scraperwiki.wordpress.com/758216635/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/scraperwiki.wordpress.com/758216635/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/scraperwiki.wordpress.com/758216635/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/scraperwiki.wordpress.com/758216635/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/scraperwiki.wordpress.com/758216635/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/scraperwiki.wordpress.com/758216635/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/scraperwiki.wordpress.com/758216635/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/scraperwiki.wordpress.com/758216635/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/scraperwiki.wordpress.com/758216635/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/scraperwiki.wordpress.com/758216635/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/scraperwiki.wordpress.com/758216635/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/scraperwiki.wordpress.com/758216635/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/scraperwiki.wordpress.com/758216635/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216635&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.scraperwiki.com/2012/04/17/parsing-panic/feed/</wfw:commentRss>
		<slash:comments>2</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/385f073a12b016d1a85c0fda88ce82d5?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">frabcus</media:title>
		</media:content>

		<media:content url="https://secure.gravatar.com/avatar.php?size=125&#38;d=identicon&#38;gravatar_id=9c94c3a7dae3a482ff5f7db2c3619430" medium="image" />

		<media:content url="http://scraperwiki.files.wordpress.com/2012/04/scraperwikiimg4.png" medium="image">
			<media:title type="html">Twitter tag cloud</media:title>
		</media:content>
	</item>
		<item>
		<title>Is scraping legal?</title>
		<link>http://blog.scraperwiki.com/2012/04/02/is-scraping-legal/</link>
		<comments>http://blog.scraperwiki.com/2012/04/02/is-scraping-legal/#comments</comments>
		<pubDate>Mon, 02 Apr 2012 20:39:47 +0000</pubDate>
		<dc:creator>Francis Irving</dc:creator>
				<category><![CDATA[thoughts]]></category>

		<guid isPermaLink="false">http://blog.scraperwiki.com/?p=758216818</guid>
		<description><![CDATA[Lots of people, when they hear about ScraperWiki, ask &#8220;is scraping legal? how can you build a business off that?&#8221;. Usually to follow up by saying &#8220;we do it in our company, but we would never tell anyone&#8221;. This is &#8230; <a href="http://blog.scraperwiki.com/2012/04/02/is-scraping-legal/">Continue reading <span class="meta-nav">&#8594;</span></a><img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216818&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="http://scraperwiki.files.wordpress.com/2012/04/supreme_court_of_the_united_states.jpg"><img class="alignright size-full wp-image-758216822" title="Supreme Court of the United States" src="http://scraperwiki.files.wordpress.com/2012/04/supreme_court_of_the_united_states.jpg?w=640" alt=""   /></a>Lots of people, when they hear about ScraperWiki, ask &#8220;is scraping legal? how can you build a business off that?&#8221;. Usually to follow up by saying &#8220;we do it in our company, but we would never tell anyone&#8221;.</p>
<p>This is strange to us, as we have come from a world of good scraping. Taking Government data, and making it easier for people to use for things that benefit all of society. We&#8217;re in favour of that kind of scraping.</p>
<p>It&#8217;s obviously a spectrum. At the other extreme, the most evil scraping would be to steal content that somebody else sells, and then to republish it at harm to their business. We&#8217;re against that kind of scraping.</p>
<p>It&#8217;s not scraping itself which is good or bad, or legal or illegal, but the circumstances in which you&#8217;re doing it.</p>
<p>We&#8217;ve written up in full our policy about the legality; it&#8217;s in our FAQ under &#8216;<a href="https://scraperwiki.com/docs/python/faq/#scraping_legality">What&#8217;s your policy on what&#8217;s legal to scrape?</a>&#8217;. Lots of details about robots.txt and take down notices, and what is our and your legal responsibility.</p>
<p>Finally, ScraperWiki isn&#8217;t just about scraping.</p>
<p>We&#8217;re a <a href="http://en.wikipedia.org/wiki/Data_hub">data hub</a>, and you need to get data into a data hub. As well as scraping, lots of people make API calls to do that on ScraperWiki, or download their own files from their own servers.</p>
<p>This is much more profound than it sounds &#8211; when you are using data for a new purpose, even if it is already structured, you still need to get it and convert it to your new needs. How you do that is a detail that depends on the circumstances.</p>
<p>The difference between parsing HTML web pages, and using a JSON REST API is surprisingly small. As an example, Thomas <a href="https://groups.google.com/forum/#!msg/scraperwiki/kyZ70Drw3zE/VJCvpWgzsegJ">scraped EventBrite</a> even though it has an API (see the post at the end of that thread by Ryan who works at EventBrite!), because it was easier at the time for him.</p>
<p>What matters is getting the data, and converting it into a form where it can <em>do something useful for the world</em>. And doing that legally. Whether you&#8217;re using <a href="http://nokogiri.org/">Nokogiri</a> or <a href="http://www.rubyinside.com/nestful-a-simple-ruby-http-rest-client-library-3227.html">Nestful</a>.</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/scraperwiki.wordpress.com/758216818/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/scraperwiki.wordpress.com/758216818/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/scraperwiki.wordpress.com/758216818/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/scraperwiki.wordpress.com/758216818/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/scraperwiki.wordpress.com/758216818/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/scraperwiki.wordpress.com/758216818/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/scraperwiki.wordpress.com/758216818/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/scraperwiki.wordpress.com/758216818/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/scraperwiki.wordpress.com/758216818/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/scraperwiki.wordpress.com/758216818/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/scraperwiki.wordpress.com/758216818/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/scraperwiki.wordpress.com/758216818/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/scraperwiki.wordpress.com/758216818/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/scraperwiki.wordpress.com/758216818/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216818&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.scraperwiki.com/2012/04/02/is-scraping-legal/feed/</wfw:commentRss>
		<slash:comments>3</slash:comments>
	
		<media:content url="http://1.gravatar.com/avatar/385f073a12b016d1a85c0fda88ce82d5?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">frabcus</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/04/supreme_court_of_the_united_states.jpg" medium="image">
			<media:title type="html">Supreme Court of the United States</media:title>
		</media:content>
	</item>
		<item>
		<title>&#8230;in data we trust&#8230;.</title>
		<link>http://blog.scraperwiki.com/2012/03/28/in-data-we-trust/</link>
		<comments>http://blog.scraperwiki.com/2012/03/28/in-data-we-trust/#comments</comments>
		<pubDate>Wed, 28 Mar 2012 15:11:53 +0000</pubDate>
		<dc:creator>ainemcguire</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://blog.scraperwiki.com/?p=758216707</guid>
		<description><![CDATA[We&#8217;re in Washington DC, the nation&#8217;s capital and US HQ! The city is bathed in spring sunlight, the blossoms are out and there&#8217;s a bit of a buzz about the town. The ScraperWiki truck is getting ready to park at &#8230; <a href="http://blog.scraperwiki.com/2012/03/28/in-data-we-trust/">Continue reading <span class="meta-nav">&#8594;</span></a><img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216707&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="http://scraperwiki.files.wordpress.com/2012/03/uncle_sam.jpg"><img class="wp-image-758216710 alignleft" title="uncle_sam" src="http://scraperwiki.files.wordpress.com/2012/03/uncle_sam.jpg?w=181&h=194" alt="" width="181" height="194" /></a>We&#8217;re in Washington DC, the nation&#8217;s capital and US HQ! The city is bathed in spring sunlight, the blossoms are out and there&#8217;s a bit of a buzz about the town. The ScraperWiki truck is getting ready to park at The Washington Post on Friday and Saturday for our 3<sup>rd</sup> major US Journalism Data Camp (hashtag: #jdcdc)</p>
<p>It&#8217;s an election year so we can be forgiven for feeling a little smug, our raison d&#8217;etre is to dig up data, so where better to make it happen than at The Washington Post, a newspaper that inspired a generation of investigative journalists, inscribed the word &#8216;Watergate&#8217; as a formal entry in the Oxford English Dictionary, and made &#8216;deep throat&#8217; a <em>double entendre</em>!</p>
<p><a href="http://scraperwiki.files.wordpress.com/2012/03/imag02293.jpg"><img class="alignright size-large wp-image-758216783" title="IMAG0229" src="http://scraperwiki.files.wordpress.com/2012/03/imag02293.jpg?w=1024&h=406" alt="" width="1024" height="406" /></a>Health, transport, education, security, they&#8217;re all ripe for data liberation. We&#8217;ve detected interest in &#8220;Super&#8221; PACS and lobbying data, so let&#8217;s hope we see a major focus on these at the event. One of AP&#8217;s senior investigative reporters <strong>Jack Gillum</strong>, (@jackgillum) is keen to drill into <a href="http://query.nictusa.com/cgi-bin/dcdev/indexp/1">Independent Expenditure</a> aka Election Advertisements, and <a href="http://www.fec.gov/finance/disclosure/efile_search.shtml">Campaign Finance Disclosure</a> data.  Our own <a href="http://en.wikipedia.org/wiki/Julian_Todd">Julian Todd</a> (@goatchurch) has commenced work on liberating <a href="https://scraperwiki.com/scrapers/ny_state_lobby_parse/">lobbying data</a> in New York.</p>
<p>The guys here at The Washington Post have a wish list for liberation and it&#8217;s by no means exhaustive:</p>
<ul>
<li><a href="http://www.montgomeryschoolsmd.org/departments/regulatoryaccountability/glance/fy2011/fy2011.shtm">School test scores</a></li>
<li><a href="http://www.fairfaxcounty.gov/circuitcourtdocket/Default.aspx">Court docket</a></li>
<li><a href="http://egov.princegeorgescountymd.gov/etrack/default.aspx">Spending disclosure</a></li>
<li><a href="http://www.polkcountyiowa.gov/inmatesontheweb/">Polk County Mug shots!</a></li>
<li><a href="http://www.iowa-city.org/icgov/apps/police/neighborhood.asp">IOWA City Calls for service</a></li>
</ul>
<p style="text-align:left;">We&#8217;re thrilled by the fact that we signed up so many data scientists and media professionals. The coders will be freeing and/or learning to scrape data and everyone else will be facilitated into teams to hypothesize, gather, analyze, create and present stories and applications based on data. The outcomes will be presented on Saturday at 04:00p and we have a bunch of prizes to give away for the most inspired ideas. We also have some special ScraperWiki prizes for technical contributions.</p>
<p style="text-align:left;"><a href="http://scraperwiki.files.wordpress.com/2012/03/agenda1.jpg"><img class="aligncenter size-medium wp-image-758216793" title="agenda" src="http://scraperwiki.files.wordpress.com/2012/03/agenda1.jpg?w=300&h=143" alt="" width="300" height="143" /></a></p>
<p><strong>What&#8217;s happening on Friday 30th?</strong></p>
<p><span style="text-decoration:underline;"><strong>08:30a</strong></span> We will open registration and serve tea coffee and biscuits</p>
<p><span style="text-decoration:underline;"><strong>09:30a</strong></span> Kick-off and a short plenary. We&#8217;ll hear from <strong><a href="http://www.thenewsliteracyproject.org/journalists/vernon_loeb/">Vernon Loeb</a></strong> (@VernonLoeb) about what it&#8217;s like to work as a data digger at the Post and <strong><a href="http://www.american.edu/soc/faculty/charlesl.cfm">Chuck Lewis</a></strong> (@crelewis) from AU will talk about partnership with the capital&#8217;s flagship publication. Our own <strong><a href="http://en.wikipedia.org/wiki/Francis_Irving">Francis Irving</a></strong> (@frabcus) will say hello and talk &#8216;data&#8217;. <strong> Julian Todd</strong> (@goatchurch) and <strong>Thomas Levine</strong> (@thomaslevine) will explain why scraping is an important technique for getting data and show some examples.  <strong><a href="http://sunlightfoundation.com/people/tlee/">Tom Lee</a></strong> (@tjl) from <a href="http://sunlightfoundation.com/">Sunlight Foundation</a> will make a <em>callout</em> for help with their <a href="http://services.sunlightlabs.com/gasp/legislators/K000148/">GASP</a> project and put some context around votesmart closing their doors.</p>
<p><span style="text-decoration:underline;"><strong>10:15a</strong></span> The Data Derby and Data Liberators will meet and pore over data ideas. We will review the lifecycle of a data-driven story and familiarise people with the ScraperWiki Data Derby route map. We will set out some ideas and facilitate people into teams, with each picking a magnet as their map route icon. The coders who have signed up for the morning <em><span style="text-decoration:underline;"><strong>&#8216;Learn to Scrape</strong></span></em>&#8217; with Python class will be directed to the tutorial room for the <em>three hour session.</em> Anyone signed up for the afternoon tutorial will join the data derby/liberators for some fun.</p>
<div id="attachment_758216" class="wp-caption aligncenter" style="width: 665px"><a href="http://scraperwiki.files.wordpress.com/2012/03/journalism-data-derby-board.jpg"><img class="wp-image-758216748  " title="Journalism data derby board" src="http://scraperwiki.files.wordpress.com/2012/03/journalism-data-derby-board.jpg?w=655&h=439" alt="" width="655" height="439" /></a><p class="wp-caption-text">Data Derby Route Map</p></div>
<p><strong>12:45p</strong> <em>Lightning Talk:</em> <strong>Greg Franczyk</strong> from<em> The Washington Post </em>will talk about the evolving role of data in the media industry and data&#8217;s evolution in media: specifically, how it is gathered and stored, its changing relationship with news, and how it&#8217;s presented to consumers<em>.<br />
<em>Callout &#8211; </em><a href="http://codeforamerica.org/author/mjumbe/"><strong>Mjumbe Poe</strong></a>, (@mjumbewu) Code for America Fellow would like to share the story of scraping council data for Councilmatic and he would like to get people interested in tackling the agendas.<br />
</em></p>
<p><span style="text-decoration:underline;"><strong>01:00p</strong></span> Light lunch</p>
<p><span style="text-decoration:underline;"><strong>01:30p</strong></span> Projects continue&#8230;</p>
<p><span style="text-decoration:underline;"><strong>02:15p</strong></span> &#8216;Learn to Scrape&#8217; Python afternoon tutorial commences <em>three hour tutorial</em></p>
<p><span style="text-decoration:underline;"><strong>05:30p</strong></span> Reception (Beer and Pizza).</p>
<p>******************Special NOTE********</p>
<p><strong>Learn to Scrape</strong><br />
<a href="http://scraperwiki.files.wordpress.com/2012/02/p1300216.jpg"><img class="alignleft" title="OLYMPUS DIGITAL CAMERA" src="http://scraperwiki.files.wordpress.com/2012/02/p1300216.jpg?w=120&amp;h=112&h=90" alt="" width="120" height="90" /></a>The two three-hour tutorials on Friday morning and afternoon will be run by our chief data scientist <a href="http://en.wikipedia.org/wiki/Julian_Todd">Julian Todd</a> (@goatchurch) and <a href="http://thomaslevine.com/">Thomas Levine</a> (@thomaslevine) data<a href="http://scraperwiki.files.wordpress.com/2012/03/tom.jpeg"><img class="alignright size-thumbnail wp-image-758216804" title="OLYMPUS DIGITAL CAMERA" src="http://scraperwiki.files.wordpress.com/2012/03/tom.jpeg?w=150&h=112" alt="" width="150" height="112" /></a> advocate aided and abetted by <a href="http://codeforamerica.org/author/michelle/">Michelle Koeth</a> (@michellekoeth) Code for America Fellow.  They will cover things like identifying good targets for webscraping and navigating the complexity of different types of web pages.  <a href="http://scraperwiki.files.wordpress.com/2012/03/michelle-and-julian.jpg"><img class="alignleft size-thumbnail wp-image-758216805" title="Michelle" src="http://scraperwiki.files.wordpress.com/2012/03/michelle-and-julian.jpg?w=125&h=150" alt="" width="125" height="150" /></a>Attendees will create their own scrapers.  The objective will be to get the data into a structured format, and join it with data from another source.  If time allows we will also try to encourage people to do further analysis.</p>
<p>*************************</p>
<p><strong>What&#8217;s happening on Saturday 31<sup>st</sup> March?</strong></p>
<p><span style="text-decoration:underline;"><strong>09:30a</strong></span> Welcome plus tea coffee and biscuits</p>
<p><span style="text-decoration:underline;"><strong>09:45a</strong></span> Throughout the morning we will follow the Data Derby route map &#8211; please study the picture above.</p>
<p><span style="text-decoration:underline;"><strong>12:45p</strong></span><em>  Lightning talk &#8211; </em><a href="http://gillum.org/"><strong>J</strong></a><span style="font-family:Times New Roman,serif;"><span style="font-size:medium;"><a href="http://gillum.org/"><strong>ack Gillum</strong></a><strong>,</strong> (@jackgillum) AP Investigative Journalist and <a href="http://michelleminkoff.com/"><strong>Michelle Minkoff</strong></a> (@michelleminkoff) Interactive Producer will talk about how “Super” PACs and big money have dominated this election cycle and tell us that there is little to fear as there is a mountain of data available on who’s backing presidential candidates and which can help journalists make sense of the big-time fundraisers this year. They&#8217;ll also talk about Federal Election Commission filings and show how they can be parsed for good storytelling.</span></span><span style="text-decoration:underline;"><span style="font-family:Times New Roman,serif;"><span style="font-size:medium;"><em><span style="text-decoration:underline;"><em><br />
Callout</em>: <strong>Jan Schaffer</strong> (@janjlab) from <a href="http://www.j-lab.org/">J-Lab</a> wants to invite ideas from our participants on how to better organize and collect their data, which includes one of the largest databases of U.S. community news sites and a significant database of grant-funded media projects.</span><br />
</em></span></span></span></p>
<p><span style="text-decoration:underline;"><strong>01:00p</strong></span> Light lunch</p>
<p><span style="text-decoration:underline;"><strong>02:00p</strong></span> Project teams will finalize the details of their data stories in preparation for the presentation.</p>
<p><span style="text-decoration:underline;"><strong>03:00p</strong></span> Heading towards the finishing line&#8230;</p>
<p><span style="text-decoration:underline;"><strong>04:00p</strong></span> Presentations and Prizes.</p>
<p>The American University School of Communication has been amazingly supportive, <strong><a href="http://www.american.edu/profiles/staff/metcalf.cfm">Sharon Metcalf </a></strong>(Director Of Partnerships and Programs), is an absolute gem as are her colleagues <strong><a href="http://www.american.edu/soc/faculty/perri.cfm">Lynne Perri </a></strong>(@Lynneperri), Professor of Journalism and <strong>Chuck Lewis</strong> (@crelewis) (Prof of Journalism and Executive Editor &#8211; Investigative Reporting Workshop) who were instrumental in getting the event off the ground. We have also been overwhelmed by the support from <strong>Vernon Loeb</strong> (@VernonLoeb) the Local Editor at The Washington Post who together with <strong>Greg Franczyk</strong> have set us up in their swish conference center.  A huge &#8216;thank you&#8217; to <strong>Jane Lockhart</strong> and her operations team for helping us with logistics. And last but by no means least a big round of applause to <strong>Associated Press</strong> for helping to fund our refreshments, <strong>Sunlight Foundation</strong> for our beer and pizza and to <strong>J-Lab</strong> for sponsoring the prizes – Hip Hip Hooray!</p>
<div id="attachment_758216739" class="wp-caption alignleft" style="width: 650px"><a href="http://scraperwiki.files.wordpress.com/2012/03/imag0225.jpg"><img class="size-full wp-image-758216739" title="IMAG0225" src="http://scraperwiki.files.wordpress.com/2012/03/imag0225.jpg?w=640&h=960" alt="" width="640" height="960" /></a><p class="wp-caption-text">Eugene Meyer (Foyer - The Washington Post)</p></div>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/scraperwiki.wordpress.com/758216707/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/scraperwiki.wordpress.com/758216707/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/scraperwiki.wordpress.com/758216707/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/scraperwiki.wordpress.com/758216707/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/scraperwiki.wordpress.com/758216707/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/scraperwiki.wordpress.com/758216707/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/scraperwiki.wordpress.com/758216707/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/scraperwiki.wordpress.com/758216707/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/scraperwiki.wordpress.com/758216707/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/scraperwiki.wordpress.com/758216707/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/scraperwiki.wordpress.com/758216707/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/scraperwiki.wordpress.com/758216707/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/scraperwiki.wordpress.com/758216707/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/scraperwiki.wordpress.com/758216707/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216707&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.scraperwiki.com/2012/03/28/in-data-we-trust/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/a801e770feed3df03f36195443374935?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">ainemcguire</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/uncle_sam.jpg?w=279" medium="image">
			<media:title type="html">uncle_sam</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/imag02293.jpg?w=1024" medium="image">
			<media:title type="html">IMAG0229</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/agenda1.jpg?w=300" medium="image">
			<media:title type="html">agenda</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/journalism-data-derby-board.jpg?w=1024" medium="image">
			<media:title type="html">Journalism data derby board</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/02/p1300216.jpg?w=150&#38;h=112" medium="image">
			<media:title type="html">OLYMPUS DIGITAL CAMERA</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/tom.jpeg?w=150" medium="image">
			<media:title type="html">OLYMPUS DIGITAL CAMERA</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/michelle-and-julian.jpg?w=125" medium="image">
			<media:title type="html">Michelle</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/imag0225.jpg" medium="image">
			<media:title type="html">IMAG0225</media:title>
		</media:content>
	</item>
		<item>
		<title>International Data Journalism Awards&#8230;.deadline fast approaching..(10th April 2012)</title>
		<link>http://blog.scraperwiki.com/2012/03/26/international-data-journalism-awards-deadline-fast-approaching-10th-april-2012/</link>
		<comments>http://blog.scraperwiki.com/2012/03/26/international-data-journalism-awards-deadline-fast-approaching-10th-april-2012/#comments</comments>
		<pubDate>Mon, 26 Mar 2012 17:00:29 +0000</pubDate>
		<dc:creator>ainemcguire</dc:creator>
				<category><![CDATA[Uncategorized]]></category>
		<category><![CDATA[big data]]></category>
		<category><![CDATA[computer assisted reporting]]></category>
		<category><![CDATA[creative media]]></category>
		<category><![CDATA[data]]></category>
		<category><![CDATA[data driven journalism]]></category>
		<category><![CDATA[journalism]]></category>
		<category><![CDATA[media]]></category>

		<guid isPermaLink="false">http://blog.scraperwiki.com/?p=758216688</guid>
		<description><![CDATA[Everybody is talking and trying to do &#8216;data journalism&#8217; and the first ever International Data Journalism Awards have been established to recognise the huge effort that people are making in this field.  It&#8217;s a great opportunity to showcase your work.  &#8230; <a href="http://blog.scraperwiki.com/2012/03/26/international-data-journalism-awards-deadline-fast-approaching-10th-april-2012/">Continue reading <span class="meta-nav">&#8594;</span></a><img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216688&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><a href="http://scraperwiki.files.wordpress.com/2012/03/djawards.jpg"><img class="alignright size-medium wp-image-758216690" title="djawards" src="http://scraperwiki.files.wordpress.com/2012/03/djawards.jpg?w=300&h=86" alt="" width="300" height="86" /></a>Everybody is talking and trying to do &#8216;data journalism&#8217; and the first ever International Data Journalism Awards have been established to recognise the huge effort that people are making in this field.  It&#8217;s a great opportunity to showcase your work.  Backed by Google, the prizes are generous at €45,000 (over $55,000) to six winners and the process is being managed by <a href="http://www.globaleditorsnetwork.org">Global Editors</a></p>
<p>The main objectives are to a) Contribute to setting high standards and highlighting the best practices in data journalism and b) Demonstrate the value of data journalism among editors and media executives.</p>
<p>There are three categories :-</p>
<ol>
<li>Data-driven investigative journalism</li>
<li>Data visualisation &amp; storytelling</li>
<li>Data-driven applications</li>
</ol>
<p>The competition is open to media companies, non-profit organisations, freelancers and individuals. Applicants are welcome to submit their best data journalism projects <strong>before 10 April 2012</strong> at <a href="http://datajournalismawards.org/submit-your-work/">http://datajournalismawards.org/submit-your-work/</a>.</p>
<p>To find out more about the competition and how to apply check out  <a href="http://www.datajournalismawards.org/">datajournalismawards.org</a>.  If you have any questions about the competition get in touch with the lovely Liliana Bounegru, DJA Coordinator (bounegru [at] ejc [dot] net). Liliana works at the <a href="http://www.ejc.nl">European Journalism Centre</a></p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/scraperwiki.wordpress.com/758216688/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/scraperwiki.wordpress.com/758216688/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/scraperwiki.wordpress.com/758216688/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/scraperwiki.wordpress.com/758216688/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/scraperwiki.wordpress.com/758216688/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/scraperwiki.wordpress.com/758216688/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/scraperwiki.wordpress.com/758216688/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/scraperwiki.wordpress.com/758216688/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/scraperwiki.wordpress.com/758216688/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/scraperwiki.wordpress.com/758216688/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/scraperwiki.wordpress.com/758216688/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/scraperwiki.wordpress.com/758216688/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/scraperwiki.wordpress.com/758216688/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/scraperwiki.wordpress.com/758216688/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216688&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.scraperwiki.com/2012/03/26/international-data-journalism-awards-deadline-fast-approaching-10th-april-2012/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/a801e770feed3df03f36195443374935?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">ainemcguire</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/djawards.jpg?w=300" medium="image">
			<media:title type="html">djawards</media:title>
		</media:content>
	</item>
		<item>
		<title>Fine set of graphs at the Office of National Statistics</title>
		<link>http://blog.scraperwiki.com/2012/03/22/fine-set-of-graphs-at-the-office-of-national-statistics/</link>
		<comments>http://blog.scraperwiki.com/2012/03/22/fine-set-of-graphs-at-the-office-of-national-statistics/#comments</comments>
		<pubDate>Thu, 22 Mar 2012 11:47:01 +0000</pubDate>
		<dc:creator>Julian Todd</dc:creator>
				<category><![CDATA[opendata]]></category>
		<category><![CDATA[data]]></category>
		<category><![CDATA[population]]></category>
		<category><![CDATA[scraperwiki]]></category>
		<category><![CDATA[scraping]]></category>
		<category><![CDATA[visualization]]></category>

		<guid isPermaLink="false">http://blog.scraperwiki.com/?p=758216643</guid>
		<description><![CDATA[It&#8217;s difficult to keep up. I&#8217;ve just noticed a set of interesting interactive graphs over at the Office of National Statistics (UK). If the world is about people, then the most fundamental dataset of all must be: Where are the &#8230; <a href="http://blog.scraperwiki.com/2012/03/22/fine-set-of-graphs-at-the-office-of-national-statistics/">Continue reading <span class="meta-nav">&#8594;</span></a><img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216643&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p>It&#8217;s difficult to keep up. I&#8217;ve just noticed a set of <a href="http://www.ons.gov.uk/ons/guide-method/understanding-ons-statistics/interactive-content/index.html">interesting interactive graphs</a> over at the Office of National Statistics (UK).</p>
<p>If the world is about people, then the most fundamental dataset of all must be: Where are the people? And: What stage of life are they living through?</p>
<p>A Population Pyramid is a straightforward way to visualize the data, like so:</p>
<p><a href="http://www.neighbourhood.statistics.gov.uk/HTMLDocs/dvc4/subnational.html"><img class="aligncenter size-full wp-image-758216644" title="populationpyramid1" src="http://scraperwiki.files.wordpress.com/2012/03/populationpyramid1.png?w=640&h=459" alt="" width="640" height="459" /></a></p>
<p>This image is sufficient for determining what needs to be supplied (eg more children means more schools and toy-shops), but it doesn&#8217;t explain why.</p>
<p><span id="more-758216643"></span></p>
<p>The &#8220;why?&#8221; and &#8220;what&#8217;s going on?&#8221; questions are much more interesting, but are pretty much guesswork because they refer to layers in the data that you cannot see. For example, the number of people in East Devon of a particular age is the sum of those who have moved into the area at various times, minus those who have moved away (temporarily or permanently), plus those who were already there and have grown older but not yet died. For any bulge, you don&#8217;t know which layer it belongs to.</p>
<p>In this 2015 population pyramid there are bulges at 28, 50 and a pronounced spike at 68, as well as dips at 14 and 38. In terms of birth years, these correspond to 1987, 1965 and 1947 (spike), and dips at 2001 and 1977.</p>
<p>You can pretend they correspond to recessions, economic boom times and <a href="http://en.wikipedia.org/wiki/Second-wave_feminism">second wave feminism</a>, but the 1947 post-war spike when a mass of men-folk were demobilized from the military is a pretty clean signal.</p>
<p>What makes this data presentation especially lovely is that it is localized, so you can see <a href="http://www.neighbourhood.statistics.gov.uk/HTMLDocs/dvc4/subnational.html">the population pyramid per city</a>:</p>
<p><a href="http://www.neighbourhood.statistics.gov.uk/HTMLDocs/dvc4/subnational.html"><img class="aligncenter size-full wp-image-758216654" title="populationpyramid2" src="http://scraperwiki.files.wordpress.com/2012/03/populationpyramid2.png?w=640&h=379" alt="" width="640" height="379" /></a></p>
<p>Cambridge, as everyone knows, is a university town, which explains the persistent spike at the age 20.</p>
<p>And, while it looks like there is gender equality for 20 year old university students, there is a pretty hefty male lump up to the age of 30 &#8212; possibly corresponding to folks doing higher degrees. Is this because fewer men are leaving town at the appropriate age to become productive members of society, or is there an influx of foreign grad students from places where there is less of a gender equality? The data set of student origins and enrollments would give you the story.</p>
<p>As to the pyramid on the right hand side, I have no idea what is going on in <a href="http://en.wikipedia.org/wiki/London_Borough_of_Camden">Camden</a> to account for that bulge in 30 year olds. What is obvious, though, is that the bulge in infants must be related. In fact, almost all the children between the ages of 0 and 16 years will have corresponding parents higher up the same pyramid. Also, there is likely to be a pairwise cross-gender correspondence between individuals of the same generation living together.</p>
<p>These internal links, external data connections, sub-cohorts and new questions raised the more you look at it mean that it is impossible to create a single all-purpose visualization application that could serve all of these. We can wonder as to whether an interface which worked via javascript-generated SQL calls (rather than flash and server-side queries) would have enabled someone with the right skills to roll their own queries and, for example, immediately find out which city and age group has the greatest gender disparity, and whether all spikes at the 20-year-old age bracket can be accounted for by universities.</p>
<p>For more, see <a href="http://www.ons.gov.uk/ons/guide-method/method-quality/specific/population-and-migration/an-overview-of-ons-s-population-statistics/index.html">An overview of ONS&#8217;s population statistics</a>.</p>
<p>As it is, someone is going to have to download/scrape, parse and load at least one year of <a href="http://www.ons.gov.uk/ons/rel/pop-estimate/population-estimates-for-uk--england-and-wales--scotland-and-northern-ireland/population-estimates-timeseries-1971-to-current-year/index.html">source data</a> into a data hub of their choice in order to query this (we&#8217;ve started on 2010&#8217;s figures <a title="2010 ONS Population Estimates, scraped by 'Dragon' Dave McKee" href="https://scraperwiki.com/scrapers/ons-popestimates/" target="_blank">here on ScraperWiki</a> – take a look). Once that&#8217;s done, you&#8217;d be able to sort the cities by the greatest ratio between number of 20 year olds and number of 16 year olds, because that&#8217;s a good signal of student influx.</p>
<p>I don&#8217;t have time to get onto the <a href="http://www.ons.gov.uk/ons/publications/all-releases.html?definition=tcm%3A77-21600">Population projection models</a>, where it really gets interesting. There you have all the clever calculations based on guesstimates of migration, mortality and fertility.</p>
<p>What I would really like to see are these calculations done live and interactively, as well as combined with economic data. Is the state pension system going to go bankrupt because of the &#8220;baby boomers&#8221;? Who knows? I know someone who doesn&#8217;t know: someone whose opinion does not rely (even indirectly) on something approaching a dynamic data calculation. I mean, if the difference between solvency and bankruptcy is within the margin of error in the estimate of fertility rate, or 0.2% in the tax base, then that&#8217;s not what I&#8217;d call bankrupt. You can only find this out by tinkering with the inputs with an element of curiosity.</p>
<p>Privatized pensions ought to be put into the model as well, to give them the macro-economic context that no pension adviser I&#8217;ve ever known seems capable of understanding. I mean, it&#8217;s evident that the stock market (in which private pensions invest) does happen to yield a finite quantity of profit each year. Ergo it can support a finite number of pension plans. So a national policy which demands more such pension plans than this finite number is inevitably going to leave people hungry.</p>
<p>Always keep in mind the long term vision of data and governance. In the future it will all come together like transport planning, or the procurement of adequate rocket fuel to launch a satellite into orbit; a matter of measurements and predictable consequences. Then governance will be a science, like chemistry, or the prediction of earthquakes.</p>
<p style="padding:18px 20px;background:#F2F7FC;"><strong>But don&#8217;t forget:</strong> we can&#8217;t do anything without first getting the raw data into a usable format. <a href="https://scraperwiki.com/scrapers/ons-popestimates/">Dave McKee&#8217;s started on 2010&#8217;s data here</a> … fancy helping out?</p>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/scraperwiki.wordpress.com/758216643/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/scraperwiki.wordpress.com/758216643/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/scraperwiki.wordpress.com/758216643/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/scraperwiki.wordpress.com/758216643/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/scraperwiki.wordpress.com/758216643/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/scraperwiki.wordpress.com/758216643/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/scraperwiki.wordpress.com/758216643/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/scraperwiki.wordpress.com/758216643/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/scraperwiki.wordpress.com/758216643/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/scraperwiki.wordpress.com/758216643/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/scraperwiki.wordpress.com/758216643/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/scraperwiki.wordpress.com/758216643/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/scraperwiki.wordpress.com/758216643/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/scraperwiki.wordpress.com/758216643/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216643&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.scraperwiki.com/2012/03/22/fine-set-of-graphs-at-the-office-of-national-statistics/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/ae3cb03a98a6470bdf839dd84a226e47?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">goatchurch</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/populationpyramid1.png" medium="image">
			<media:title type="html">populationpyramid1</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/populationpyramid2.png" medium="image">
			<media:title type="html">populationpyramid2</media:title>
		</media:content>
	</item>
		<item>
		<title>Telling Stories with Data: Life at a Hispanic Serving University in Texas!</title>
		<link>http://blog.scraperwiki.com/2012/03/19/telling-stories-with-data-life-at-a-hispanic-serving-university-in-texas/</link>
		<comments>http://blog.scraperwiki.com/2012/03/19/telling-stories-with-data-life-at-a-hispanic-serving-university-in-texas/#comments</comments>
		<pubDate>Mon, 19 Mar 2012 18:32:56 +0000</pubDate>
		<dc:creator>ainemcguire</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">http://blog.scraperwiki.com/?p=758216647</guid>
		<description><![CDATA[Guest post by Cindy Royal I&#8217;m an associate professor in the School of Journalism and Mass Communication at Texas State University in San Marcos. We&#8217;re just a short distance from Austin, with a large (&#62;34,000 students) and diverse campus. Since &#8230; <a href="http://blog.scraperwiki.com/2012/03/19/telling-stories-with-data-life-at-a-hispanic-serving-university-in-texas/">Continue reading <span class="meta-nav">&#8594;</span></a><img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216647&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></description>
			<content:encoded><![CDATA[<p><em>Guest post by Cindy Royal</em></p>
<p><a href="http://scraperwiki.files.wordpress.com/2012/03/cindy-royal-texas-state-event-21st-march-2012.jpg"><img class="alignleft size-full wp-image-758216648" title="Cindy Royal - Texas State event 21st March 2012" src="http://scraperwiki.files.wordpress.com/2012/03/cindy-royal-texas-state-event-21st-march-2012.jpg?w=640" alt=""   /></a>I&#8217;m an associate professor in the School of Journalism and Mass Communication at Texas State University in San Marcos. We&#8217;re just a short distance from Austin, with a large (&gt;34,000 students) and diverse campus. Since I joined the faculty at Texas State, I have been focusing on advancing students&#8217; technology skills and exposing them to the perspective of data journalism. For the 2011-2012 academic year, my colleague Jacie Yang and I were awarded a grant sponsored by the Association for Education in Journalism and Mass Communication and the Knight Foundation called <a href="http://www.aejmc.org/topics/archives/3412" target="_blank">Building a Bridge Between the Knight News Challenge and JMC Programs</a>. The goal of our proposal is to develop a site that focuses on Texas State&#8217;s recent designation as a <a href="http://www.txstate.edu/news/news_releases/news_archive/2011/March-2011/HSIRelease032411.html" target="_blank">Hispanic Serving Institution</a>. We chose to partner with the Knight News Challenge winner The Jefferson Institute and use their Drupal-based VIDI data visualization modules, but we also wanted to broadly expose students to more of the innovative work of News Challenge winners through a semester-long speaker series. Thus the <a href="http://cindyroyal.net/advanced/?q=content/telling-stories-data-speaker-series" target="_blank">Telling Stories with Data</a> series was conceived.</p>
<p>One of the first organizations we considered was Scraperwiki. I have long been following the work on their site, and I appreciate the approach of a simple Web interface that allows the user to customize code in a variety of languages. We are using my <a href="http://cindyroyal.net/advanced" target="_blank">Advanced Online Media course</a> as the platform for the grant, and that class consists of graduate students (and one brave undergraduate) who have a communications background but are interested in advancing their technology skills and knowledge as it relates to storytelling. They have all taken an introductory Web design class and are now being introduced to higher-end programming including JQuery, Content Management Systems, PHP/MySQL and Ruby on Rails. But, they are not developers. We are approaching this by giving the students a basic understanding of code and then providing access to tools that they can easily manipulate. The Scraperwiki platform is perfect for this, and I have watched with interest as the site has evolved.</p>
<p>On March 21st, right after our major project covering SXSW on our SXTXState.com blog, we will welcome Thomas Levine, senior developer at Scraperwiki, to our campus for a scraping workshop. Levine will introduce students to the concept of scraping and help them develop a project and analyze the data. We are very excited to learn more about the platform and how we can be most productive in gaining access to information we find online.</p>
<p>Other speakers in the series include Aron Pilhofer, director of Interactive News at <a href="http://nytimes.com/" target="_blank">The New York Times</a>, and Dante Chinni, of the <a href="http://www.jeffersoninst.org/" target="_blank">Jefferson Institute</a> and <a href="http://www.patchworknation.org/" target="_blank">Patchwork Nation</a>. In addition to my class, we have invited students from other classes to attend, as well as local news organizations including the <a href="http://statesman.com/" target="_blank">Austin American-Statesman</a> and the <a href="http://texastribune.org/" target="_blank">Texas Tribune</a>.</p>
<p>You can find more information about me and the courses I teach at the links below:</p>
<ul>
<li>Cindy Royal&#8217;s website &#8211; <a href="http://cindyroyal.com/" target="_blank">cindyroyal.com</a></li>
<li>Advanced Online Media course site &#8211; <a href="http://cindyroyal.net/advanced" target="_blank">cindyroyal.net/advanced</a></li>
</ul>
<br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/scraperwiki.wordpress.com/758216647/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/scraperwiki.wordpress.com/758216647/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/scraperwiki.wordpress.com/758216647/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/scraperwiki.wordpress.com/758216647/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gofacebook/scraperwiki.wordpress.com/758216647/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/facebook/scraperwiki.wordpress.com/758216647/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gotwitter/scraperwiki.wordpress.com/758216647/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/twitter/scraperwiki.wordpress.com/758216647/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/scraperwiki.wordpress.com/758216647/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/scraperwiki.wordpress.com/758216647/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/scraperwiki.wordpress.com/758216647/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/scraperwiki.wordpress.com/758216647/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/scraperwiki.wordpress.com/758216647/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/scraperwiki.wordpress.com/758216647/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=blog.scraperwiki.com&#038;blog=14548467&#038;post=758216647&#038;subd=scraperwiki&#038;ref=&#038;feed=1" width="1" height="1" />]]></content:encoded>
			<wfw:commentRss>http://blog.scraperwiki.com/2012/03/19/telling-stories-with-data-life-at-a-hispanic-serving-university-in-texas/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
	
		<media:content url="http://0.gravatar.com/avatar/a801e770feed3df03f36195443374935?s=96&#38;d=identicon&#38;r=G" medium="image">
			<media:title type="html">ainemcguire</media:title>
		</media:content>

		<media:content url="http://scraperwiki.files.wordpress.com/2012/03/cindy-royal-texas-state-event-21st-march-2012.jpg" medium="image">
			<media:title type="html">Cindy Royal - Texas State event 21st March 2012</media:title>
		</media:content>
	</item>
	</channel>
</rss>
