"""
Queries google across a matrix of computing-themed search terms.

Intended to find all of the implementations of one programming language in
another, and other things of interest.
"""

import sys
import time

try:  # Python 2 module names, as this script originally imported them.
    import cPickle as pickle
    import urlparse
except ImportError:  # Python 3 equivalents.
    import pickle
    import urllib.parse as urlparse

try:
    import google  # PyGoogle; only needed when a search is actually run.
except ImportError:
    google = None


# NOTE(review): the templates below contain no markup at all, yet
# render_page() substitutes a "URL" placeholder that _PROJECTS does not
# contain and _HEADING_ROW is never referenced.  It looks like the HTML tags
# (and the original whitespace) were stripped from this copy of the file --
# confirm against the original before relying on the generated output.
_PAGE = """\
TechGoogle

fish.cx


Up to: fish.cx

TechGoogle

The result of searching SourceForge for some computing terms.

HEADING_ROW DATA_ROWS

This page was generated by techgoogle.py, %s, thanks to PyGoogle doing all the hard work.

"""

_HEADING_ROW = """\
NAME
"""

_DATA_ROWS = """\
DATA_ROW
"""

_DATA_ROW = """\
PROJECTS
"""

_PROJECTS = """NAME"""


def render_page(terms, matrix, search_time):
    """Print the results page for a terms-by-terms matrix to stdout.

    terms       -- sequence of search-term strings; they label both the
                   heading row and each data row.
    matrix      -- indexable sequence of (t1, t2, projects) tuples, one for
                   every ordered pair of distinct terms, in the order
                   techgoogle() produces them (t1 outer loop, t2 inner
                   loop).  projects is a list of (project_name, URL) pairs.
                   The sequence is only read, never modified.
    search_time -- value substituted for the "%s" in the page footer.

    Raises IndexError when matrix holds fewer entries than there are
    off-diagonal cells.  An entry whose terms do not match the cell it lands
    in is reported on stderr and rendered anyway (best effort).
    """
    page = _PAGE % search_time

    heading_row_items = [""]  # empty top-left corner
    heading_row_items.extend("%s" % term for term in terms)
    page = page.replace(
        "HEADING_ROW", ("\n" + " " * 12).join(heading_row_items))

    #
    # Rows.
    #
    # NOTE(review): in this copy of the file the separator literal below was
    # split across a physical line break, which suggests a tag (presumably a
    # line-break element) preceded the "\n" -- confirm against the original.
    project_separator = "\n" + " " * 16
    cursor = 0  # index of the next unused matrix entry
    data_rows_html = []
    for t1 in terms:
        row_items_html = [_DATA_ROW.replace("PROJECTS", t1).lstrip()]  # row name
        for t2 in terms:
            if t1 == t2:
                row_items_html.append("-")  # diagonal: a term against itself
                continue

            entry_t1, entry_t2, projects = matrix[cursor]
            cursor += 1
            if (entry_t1, entry_t2) != (t1, t2):
                # Diagnostics go to stderr so they cannot end up inside the
                # generated page; an explicit test (rather than assert) keeps
                # the check alive under "python -O".
                sys.stderr.write(
                    "Warning: cell (%s, %s) was given results for (%s, %s)\n"
                    % (t1, t2, entry_t1, entry_t2))

            links = [
                _PROJECTS.replace("URL", URL).replace("NAME", project_name)
                for project_name, URL in projects
            ]
            row_items_html.append(
                _DATA_ROW.replace("PROJECTS", project_separator.join(links)))

        data_rows_html.append(
            _DATA_ROWS.replace("DATA_ROW", "\n".join(row_items_html)))

    page = page.replace("DATA_ROWS", "\n".join(data_rows_html))
    print(page)


def techgoogle(terms):
    """Run one Google search per ordered pair of distinct terms.

    Each query has the form '"t1" "t2" site:sourceforge.net' and is echoed
    to stderr before it is sent.

    Returns a list of (t1, t2, results) tuples, where results is whatever
    google.doGoogleSearch() returned, ordered with t1 as the outer loop and
    t2 as the inner loop.

    Raises ImportError if the PyGoogle "google" module could not be imported.
    """
    if google is None:
        raise ImportError(
            "the PyGoogle 'google' module is required to run searches")

    ret = []
    for t1 in terms:
        for t2 in terms:
            if t2 == t1:
                continue
            query = '"%s" "%s" site:sourceforge.net' % (t1, t2)
            sys.stderr.write("Search: %s\n" % query)
            results = google.doGoogleSearch(query)
            ret.append((t1, t2, results))
    return ret


def _sourceforge_projects(search):
    """Return the distinct (project_name, URL) pairs found in one search.

    search must expose a .results sequence whose items carry a .URL string.
    Only URLs containing ".sourceforge.net" or ".sf.net" are kept; the
    project name is the first dot-separated label of the URL's network
    location, and only the first URL seen for each name is retained.
    """
    projects = []
    seen_names = set()
    for hit in search.results:
        URL = hit.URL
        if ".sourceforge.net" in URL or ".sf.net" in URL:
            project_name = urlparse.urlparse(URL)[1].split(".")[0]
            if project_name not in seen_names:
                seen_names.add(project_name)
                projects.append((project_name, URL))
    return projects


def main(reload_from_google=True, cached_search_time=1086021513):
    """Fetch (or reload) the search results and print the results page.

    reload_from_google -- when true (the default), query Google for every
                          pair of terms and pickle the raw results to
                          "techgoogle_results_<unix time>.txt".  When false,
                          load a previously pickled results file instead.
    cached_search_time -- unix timestamp identifying the results file to
                          load when reload_from_google is false.
    """
    TERMS = """
        python lua perl java javascript C++
        xml soap corba rpc svg graph
        blog database image wiki
        mozilla openoffice apache wxWidgets
    """.split()

    if reload_from_google:
        #
        # Reload all the values from google
        # (only 1000 queries allowed per day, remember!)
        #
        search_time = int(time.time())
        all_results = techgoogle(TERMS)
        # Pickle data is bytes: binary mode is required on Python 3 and
        # avoids newline translation elsewhere.
        with open("techgoogle_results_%s.txt" % search_time, "wb") as out:
            pickle.dump(all_results, out)
    else:
        # Load previously-saved results; these terms are dropped before
        # rendering the cached data.
        for t in ["image", "wiki", "blog", "database",
                  "mozilla", "openoffice", "apache", "wxWidgets"]:
            TERMS.remove(t)
        search_time = cached_search_time
        # NOTE: unpickling executes arbitrary code; only load result files
        # this script wrote itself.
        with open("techgoogle_results_%s.txt" % search_time, "rb") as saved:
            all_results = pickle.load(saved)

    search_time = time.asctime(time.localtime(search_time))

    matrix = [
        (t1, t2, _sourceforge_projects(search))
        for t1, t2, search in all_results
    ]

    render_page(TERMS, matrix, search_time)


if __name__ == "__main__":
    main()