""" to create the html docs of this script run it thru epydoc python yahoo_stripper.py produces the example file clean.html (basically you are doing a search on yahoo for "fravia searching" There was a discussion on the board on how to parse html. One person suggested using regular expressions. Not a good idea too prone to mistakes. Anyway here is how to scrub the links and descriptions out of a yahoo search. And generate your own nice and clean webpage ;). Lots of room for improvement, meant to be a learning tool. references http://www.voidspace.org.uk/python/articles/urllib2.shtml http://diveintopython.org/html_processing/ http://epydoc.sourceforge.net XML parsers come in 2 different flavors DOM and SAX. HTML is an ancestor of XML. sgmllib is a parser for html that is of the SAX flavor The parser in javascript is a DOM parser since it parses the entire webpage and makes a "tree". SAX Parser always faster but you have less control. Anyway the page if fetched from yahoo using urllib2. Then it is fed thru YahooHTMLParser. in self.clean_links lo and behold ends up a list of the links on yahoo, allong with the abstract (if it is there). Then I take the list and make my own webpage ;) Again I am not evangelizing python, you can do the same thing in Perl, or Scheme or any other language that has a html parser. Even in python there are other html parsers besides sgml. """ import urllib2 import urllib from sgmllib import SGMLParser class YahooHTMLParser(SGMLParser): """Our Parser which we have created to strip the garbage from yahoo""" class CleanLink(object): """convenience class to hold the cleaned links from yahoo""" def __init__(self): self.link = None self.abstract = [] def feed(self,page): """This is where we initialize the I{state variables} think of it as the constructor""" self.in_link = False self.in_abstract = False self.clean_links = [] self.current_link = None SGMLParser.feed(self,page) def start_div(self,attrs): """catch div attributes. I{Abstracts} in yahoo are stored in a div with a class attribute of "yschabstr" """ attrs = dict(attrs) try : cls = attrs["class"] if cls == "yschabstr" : self.in_abstract = True except KeyError : pass def end_div(self): """In a more complex parser we should count the number of divs we have entered. Here we just blindly assume that if a div has ended and we are in an abstract we should just leave. Notice I save the strings as a list then at the end join the strings. This is because of how python handles strings and an optimization trick. (search on web for optimizing python) """ if self.in_abstract : self.in_abstract = False self.current_link.abstract="".join(self.current_link.abstract) def start_a(self,attrs): """look for the anchor which has class attribute of "yschttl" """ attrs = dict(attrs) try : if attrs["class"] == "yschttl" : self.current_link = self.CleanLink() self.clean_links.append(self.current_link) self.current_link.link = attrs["href"] self.in_link = True except KeyError: pass def end_a(self): if self.in_link : self.in_link = False def unknown_starttag(self,tag,attrs) : """if we are in an abstract just save the html tag so it looks the same""" if self.in_abstract : self.current_link.abstract.append(self.get_starttag_text()) def unknown_endtag(self,tag) : if self.in_abstract : self.current_link.abstract.append("%s>"%(tag)) def handle_data(self,text): """If we are in abstract save the text""" if self.in_abstract : self.current_link.abstract.append(text) def main(): """main procedure fetch the search query, feed it into the parser and then create our own simple web page called "clean.html" """ yahoo_url = "http://search.yahoo.com/search?%s" params = {"p" : "fravia searching", "ei" : "UTF-8", "fr" : "sfp"} url_params = urllib.urlencode(params) f = urllib2.urlopen(yahoo_url%(url_params)) page = f.read() f.close() yahParser = YahooHTMLParser() yahParser.feed(page) yahParser.close() link_html="