Query

#!/home/public/cgi-bin/newlisp ; Retrieves a page with 20 links via Google containing ; the search word, then spawns 20 child processes, each ; retrieving one page from a link. All pages are then ; HTML-cleaned, tokenized and words are counted using ; bayes-train. Only for Mac OS X, Linux and other Unix, ; does not run on Windows. ; USAGE: query ; or rename to query.cgi and run on a web server ; using the following query.html: ; NOTE!!! Google changes the formatting of links (see LINK_REGEX) once ; in a while and then the whole thing breaks! ; ;Query ; ;

Parallel processing support on newLISP using 'spawn'

; ; ;(set 'cgi-mode nil) (set 'cgi-mode true) ; run as CGI get question from query.html (when cgi-mode (load "cgi.lsp") (print "Content-Type: text/html\n\n")) ;; globals and constants in this namespace (set 'LINK_REGEX {q=(http://[^>& ]+)&}) (set 'START_TIME 0) (set 'SITES '()) (set 'CONTENT "") (constant 'BRK (if cgi-mode "
" "")) (define (url-decode str) (replace "%([0-9A-F][0-9A-F])" str (char (int $1 0 16)) 1)) (define (url-encode str) (replace {([^a-zA-Z0-9])} str (format "%%%2X" (char $1)) 0)) (define (clean-html text) (replace "" text "" 4) (replace "" text "" 4) (replace "" text "" 4) (replace "<[^>]*>" text " " 0) (replace "." text " . ") (replace "," text " , ") (replace "!" text " ! ") (replace "?" text " ? ") (replace "\n" text " ") (replace "\r" text "") (replace "\t" text " ") (replace " +" text " " 4) ; this should be a list of all ISO 8859-1 Symbols ; see http://www.w3schools.com/tags/ref_entities.asp ; HTML entities should be replaced with UTF-8 (replace " " text " ") (replace "—" text " - ") (replace "–" text " - ") (replace """ text "'") (replace "&" text "&") ;(replace "‎" text (char 8206)) ;(replace {\&#(\d+);} text (char (int $1)) 0) ) (if cgi-mode (set 'question (CGI:get "query")) (set 'question (last (parse (main-args -1) "/"))) ; default is 'query' ) (set 'page (get-url (string "http://www.google.com/search?q=" (url-encode question) "&num=22&ie=UTF-8&oe=UTF-8"))) (set 'links (find-all LINK_REGEX page $1)) (when cgi-mode (println {}) (println {}) (println {})) (println "query term: " (url-decode question) BRK) (set 'START_TIME (time-of-day)) ######################### this is, where all the pages are retrieved ################# ; spawn a childprocess for each link (dolist (lnk (0 20 links)) (set 'pid (spawn 'page (get-url lnk 4000))) (push (list pid lnk) SITES -1)) ; this gets executed whenever a page has been retrieved (define (page-ready-event pid) (let (link (0 80 (lookup pid SITES))) (set 'link (url-decode link)) (println (inc cnt) " pid:" pid " " (- (time-of-day) START_TIME) " ms " link BRK) ; avoid error execptions on invalid utf-8 when cleaning page (catch (push (lower-case (clean-html page)) CONTENT -1) 'result) (inc xferred (length page))) ) ; start waiting for pages (println "waiting: ..." BRK BRK) (unless (sync 10000 page-ready-event) (println BRK "timeout" BRK)) ##################################################################################### (println BRK "bytes transferred: " xferred BRK) (println "total time: " (- (time-of-day) START_TIME) " ms" BRK) ######################### all the counting is done here using bayes-train ########### (catch (load "Lex") 'result) (println "training: " (time (bayes-train (find-all {[a-zA-Z]+} CONTENT) 'Lex)) " ms" BRK) (println "total words processed: " (Lex:total 0) BRK) (println BRK) ##################################################################################### ; sort by frequency and print in four columns (set 'items (sort (Lex) (fn (x y) (> (x 1 0) (y 1 0))))) (set 'items (transpose (explode items (ceil (div (length items) 4))))) (when cgi-mode (println "

"))
(dolist (line items)
    (dolist (item line)
        (if (and item (< (length (item 0)) 19)) 
            (print (format "%-18s %5d, " (item 0) (item 1 0)))
            (print (format "%-18s      , " " ")))) ; nil
    (println))
		
(when cgi-mode
    (println "

") (println "")) ; if accumulating results over various queries ;(save "Lex" 'Lex) (exit)