<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>giencke.net &#187; wwiigis</title>
	<atom:link href="http://www.giencke.net/category/wwiigis/feed/" rel="self" type="application/rss+xml" />
	<link>http://www.giencke.net</link>
	<description>The homepage for all things giencke</description>
	<lastBuildDate>Thu, 05 Jan 2012 01:58:34 +0000</lastBuildDate>
	<language>en</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.3.1</generator>
		<item>
		<title>Cataloging Ebay Purchases the Hard Way</title>
		<link>http://www.giencke.net/2010/01/cataloging_ebay_purchases_the_hard_way/</link>
		<comments>http://www.giencke.net/2010/01/cataloging_ebay_purchases_the_hard_way/#comments</comments>
		<pubDate>Sun, 10 Jan 2010 02:30:01 +0000</pubDate>
		<dc:creator>admin</dc:creator>
				<category><![CDATA[programming]]></category>
		<category><![CDATA[wwiigis]]></category>

		<guid isPermaLink="false">http://www.giencke.net/?p=10182</guid>
		<description><![CDATA[So as part of the <a href="http://www.wwiigis.org">wwiigis.org</a> effort, I'm buying a ton of World War 2-related imagery from eBay, including a lot of neat aerial photography. Once I win an auction, I save the auction page in Firefox via 'Save Page As', which writes out an HTML page and a folder containing images for that auction. In getting a separate blog post together, I wanted to have access to the auction title and auction photograph for each auction, without having to manually go through a million folders. So what follows is a quick and dirty Python script that gets the job done, albeit in a pretty inefficient way. I'm sharing this script on the assumption that there's one other person in the universe in a similar situation, who doesn't want to waste an hour in programming land.

Click continue, below, for the code. And check out the latest images, also below (soon to be live on wwiigis.org):

<embed type="application/x-shockwave-flash" src="http://picasaweb.google.com/s/c/bin/slideshow.swf" width="420" height="267" flashvars="host=picasaweb.google.com&#038;captions=1&#038;hl=en_US&#038;feat=flashalbum&#038;RGB=0x000000&#038;feed=http%3A%2F%2Fpicasaweb.google.com%2Fdata%2Ffeed%2Fapi%2Fuser%2Fgiencke%2Falbumid%2F5424883124468029681%3Falt%3Drss%26kind%3Dphoto%26hl%3Den_US" pluginspage="http://www.macromedia.com/go/getflashplayer"></embed>
]]></description>
			<content:encoded><![CDATA[<p>So as part of the <a href="http://www.wwiigis.org">wwiigis.org</a> effort, I&#8217;m buying a ton of World War 2-related imagery from eBay, including a lot of neat aerial photography. Once I win an auction, I save the auction page in Firefox via &#8216;Save Page As&#8217;, which writes out an HTML page and a folder containing images for that auction. In getting a separate blog post together, I wanted to have access to the auction title and auction photograph for each auction, without having to manually go through a million folders. So what follows is a quick and dirty Python script that gets the job done, albeit in a pretty inefficient way. I&#8217;m sharing this script on the assumption that there&#8217;s one other person in the universe in a similar situation, who doesn&#8217;t want to waste an hour in programming land.</p>
<p>Check out the latest images below (soon to be live on wwiigis.org):</p>
<p><embed type="application/x-shockwave-flash" src="http://picasaweb.google.com/s/c/bin/slideshow.swf" width="420" height="267" flashvars="host=picasaweb.google.com&#038;captions=1&#038;hl=en_US&#038;feat=flashalbum&#038;RGB=0x000000&#038;feed=http%3A%2F%2Fpicasaweb.google.com%2Fdata%2Ffeed%2Fapi%2Fuser%2Fgiencke%2Falbumid%2F5424883124468029681%3Falt%3Drss%26kind%3Dphoto%26hl%3Den_US" pluginspage="http://www.macromedia.com/go/getflashplayer"></embed></p>
<p>And here&#8217;s the code:</p>
<pre>
import os
import re
import operator
from PIL import Image

# A script to extract auction titles and
# copy a representative image
# to a specified output directory.
# giencke@gmail.com

auction_title = re.compile(
    '&lt;title&gt;(.*?)&lt;\/title', re.IGNORECASE)
auction_images = re.compile(
    'img.*?src=\"(.*?)\"\s', re.IGNORECASE)

# Directory containing saved ebay pages
INPUT_DIRECTORY = "some dir"

# Output directory to store representative
# image from auction, maybe
OUTPUT_DIRECTORY = 'some dir'

# Output file to contain listing of files, and copied images
OUTPUT_TEXTFILE_NAME = 'images.txt'

# Copy image to OUTPUT_DIRECTORY?
COPY_IMAGE = True

# Some images aren't worth processing, those go here
EBAY_IMAGES_TO_IGNORE = ['logoEbay_x45.gif', 'noscript']
output_file_obj = open(
    os.path.join(OUTPUT_DIRECTORY,
                 OUTPUT_TEXTFILE_NAME), 'w')

# Recusively go through images folders
# containing saved ebay pages
for ebay_folders in os.walk(INPUT_DIRECTORY):

  # The first tuple value is the directory
  parent_directory = ebay_folders[0]

  # open the actual auction html
  ebay_page = '%s.htm' %  ebay_folders[0].strip('_files')
  try:
    in_html = open(ebay_page).read()
  except IOError:
    continue

  # The auction name lives in the html title
  auction_name = ''.join(auction_title.search(
      in_html).groups()[0].split(' - ')[:-1])

  # This tuple will be used to contain the largest image
  largest_image = (0,)

  for image in set(auction_images.findall(in_html)):

    image_basename = os.path.basename(image)
    image_to_open = os.path.join(ebay_folders[0],
                                 image_basename)

    # open up the image to get dimensions, etc
    try:
      in_image = Image.open(image_to_open)
    except IOError:
      continue

    if image_basename not in EBAY_IMAGES_TO_IGNORE:
      # TODO: Check for preferred formats

      width, height = in_image.size
      image_dimensions = width * height
      if image_dimensions &gt;= largest_image[0]:
        largest_image = (image_dimensions,
                         image_to_open,
                         '%sx%s' % (width, height))

  # Format for output text
  output_file_obj.write('%s || %s || %s\n' % (
      auction_name.title(), os.path.basename(largest_image[1]),
                                             largest_image[-1]))

  if COPY_IMAGE:
    out_image = Image.open(largest_image[1])
    out_image.save(os.path.join(
        OUTPUT_DIRECTORY, os.path.basename(largest_image[1])))
output_file_obj.close()
</pre>
]]></content:encoded>
			<wfw:commentRss>http://www.giencke.net/2010/01/cataloging_ebay_purchases_the_hard_way/feed/</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
</channel>
</rss>

