#!/usr/bin/perl
# File:    getproxy.pl
# Purpose: find currently usable HTTP/Socks5 proxies through a search engine
# Author:  watercloud (watercloud@xfocus.org watercloud@nsfocus.com)
# Written: 2005-6-23
# Updated: 2005-11-5
#
# NOTE(review): the copy of this script that was reviewed had lost two spans
# of text (the tail of the regex in get_proxys plus the head of
# test_http_proxy, and the usage text plus the head of getopts).  Those
# spans are reconstructed below and individually marked NOTE(review);
# confirm them against a pristine copy of the script.
#
# The source is deliberately NOT under "use utf8": the default search key is
# passed to uri_escape() as raw bytes.

use strict;
use warnings;
use Data::Dump qw(dump);
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
use URI::Escape;
use Getopt::Long;
use threads;
use threads::shared;
use Thread::Semaphore;    # down / up
use IO::Socket::Socks;

my $DEBUG      = 1;
my $SEARCH_NUM = 100;
my $GOOGLE = "http://www.google.com.tw/search?ie=gb2312&oe=UTF-8&num=$SEARCH_NUM&q=";
my $BAIDU  = "http://www.baidu.com/s?rn=100&wd=";

my $m_key        = "代理 每日更新 ";
my $m_test_site  = "www.microsoft.com";
my $m_test_port  = 80;
my $m_proxy_type = "http";
my $m_proxy_file;
my $m_url_file;
my $m_timeout    = 10;
my $m_max_num    = 10;
my $m_thread_num = 10;
my $m_raw_uri;
my $m_out_format = "";

getopts();

# One semaphore slot per concurrently running proxy test.
my $m_sem = Thread::Semaphore->new($m_thread_num);

# Shared between the main thread and the worker threads.
my $m_get_num :shared = 0;    # number of working proxies found so far
my $exit_flag :shared = 0;    # set once $m_max_num proxies were found

main();

#--------------------subs---------------

# Program entry point.  Obtains a proxy list (from --load-proxys, from the
# pages listed in --load-urls, or from the pages a search engine returns),
# starts one test thread per proxy and waits for the threads.
sub main {
    my %url_hist;    # URLs already fetched
    my @threads;

    if ($m_proxy_file) {
        push @threads, test_proxys(_read_lines($m_proxy_file));
    }
    else {
        my $rurls = $m_url_file
            ? _read_lines($m_url_file)
            : get_proxy_pub_urls($m_raw_uri);
        dump($rurls) if $DEBUG >= 4;

        for my $url (@$rurls) {
            # Record every URL so that a page listed twice is fetched once.
            next if $url_hist{$url}++;
            my $rproxys = get_proxys($url);
            push @threads, test_proxys($rproxys);
            last if $exit_flag;
        }
    }

    dump("threads:\n", @threads) if $DEBUG >= 4;
    for my $thr (@threads) {
        # Once enough proxies were found the remaining tests are not
        # waited for.
        if ($exit_flag) {
            $thr->detach();
        }
        else {
            $thr->join();
        }
    }
    return;
}

# Reads a text file and returns a reference to its non-empty lines with the
# line endings removed.  Dies if the file cannot be opened; exits with
# status 1 if the file has no lines at all.
sub _read_lines {
    my ($path) = @_;

    open my $fh, '<', $path or die "Open file $path error: $!\n";
    my @lines = <$fh>;
    close $fh;

    exit 1 if @lines == 0;
    print "Read file ok.\n" if $DEBUG;

    s/\r?\n\z// for @lines;
    return [ grep { length } @lines ];
}

# test_proxys(\@proxys) -> @threads
# Starts one test thread per distinct "host:port" entry.  Blocks on the
# semaphore so that at most $m_thread_num tests run at once.  Sets
# $exit_flag and stops early once $m_max_num working proxies were found.
sub test_proxys {
    my $ref = shift @_;
    my %proxy_hist;
    my @threads;

    for my $proxy (@$ref) {
        next if $proxy_hist{$proxy}++;

        $m_sem->down();
        print STDERR "testing $proxy ...\n" if $DEBUG > 2;

        my $worker = $m_proxy_type eq "http"
            ? \&test_http_proxy
            : \&test_socks_proxy;
        my $t = threads->create($worker, $proxy);
        if ($t) {
            push @threads, $t;
        }
        else {
            # No worker will release the slot taken above.
            $m_sem->up();
        }

        # Stop as soon as enough live proxies have been found.
        if ($m_get_num >= $m_max_num) {
            $exit_flag = 1;
            last;
        }
    }
    return @threads;
}

# filter_url($search_uri, $candidate_url) -> 1 to drop the link, 0 to keep it
# Drops links that are not http(s), that point back into the search
# engine's own domain, or that are Google cache links.
sub filter_url {
    dump(@_) if $DEBUG >= 4;
    return 1 if @_ != 2;
    my ($search_uri, $candidate) = @_;

    return 1 if $search_uri !~ /^http/;
    return 1 if $candidate !~ /^http/;

    # ->host does not exist for every string that merely starts with
    # "http"; treat such links as filtered instead of dying.
    my $host1 = eval { URI->new($search_uri)->host };
    my $host2 = eval { URI->new($candidate)->host };
    return 1 if !defined $host1 || !defined $host2;

    # Compare the hosts with their first label ("www.") removed.
    my $key1 = substr($host1, index($host1, ".") + 1);
    my $key2 = substr($host2, index($host2, ".") + 1);
    return 1 if $key1 eq $key2;
    return 1 if $host1 eq $key2;
    return 1 if $candidate =~ /search\?q=cache/;    # Google cache marker
    return 0;
}

# get_proxy_pub_urls($search_site_uri) -> \@urls
# Fetches the search result page and returns the targets of its <a> links
# that pass filter_url.  Exits with status 1 if the search request fails.
sub get_proxy_pub_urls {
    my $search_uri = shift @_;

    my $ua = LWP::UserAgent->new;
    $ua->cookie_jar({});
    $ua->agent('Mozilla/4.0 (compatible; MSIE 6.0; Windws NT 5.1)');
    $ua->timeout($m_timeout * 3);

    my $res_obj = $ua->get($search_uri);
    print STDERR "-->", $search_uri, "\n", $res_obj->status_line, "\n";
    exit 1 if !$res_obj->is_success();

    my $html_parse = HTML::LinkExtor->new();
    $html_parse->parse($res_obj->as_string);
    $html_parse->eof;    # flush text the parser may still be buffering

    my @urls;
    for my $link ($html_parse->links) {
        # Each link is [ tag, attribute name, attribute value, ... ].
        next if $link->[0] ne "a";
        next if filter_url($search_uri, $link->[2]);
        push @urls, $link->[2];
    }
    return \@urls;
}

# get_proxys($url) -> \@proxys
# Fetches a page and returns the "ip:port" pairs found in it.  Returns an
# empty list if the page cannot be fetched.
sub get_proxys {
    my $url = shift @_;
    my @proxys;

    my $ua = LWP::UserAgent->new;
    $ua->cookie_jar({});
    $ua->agent('Mozilla/4.0 (compatible; MSIE 6.0; Windws NT 5.1)');
    $ua->timeout($m_timeout * 2);

    print STDERR "connecting ", $url, " ...\n" if $DEBUG >= 2;
    my $res_obj = $ua->get($url);
    print STDERR $url, "\t", $res_obj->status_line, "\n" if $DEBUG > 1;
    return \@proxys if !$res_obj->is_success();

    my $html = $res_obj->as_string;

    # NOTE(review): everything after "\D+" in this pattern, and the loop
    # body, were lost in the reviewed copy and are reconstructed.  The
    # "ip:port" result format follows from how the test subs consume it.
    while ($html =~ m/
            (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})   # IP
            \D+
            (\d{2,5}) (?![\d.])                    # port
        /xg) {
        push @proxys, "$1:$2";
    }
    return \@proxys;
}

# test_http_proxy("host:port")
# Thread body: fetches http://$m_test_site/ through the proxy and reports
# the proxy on STDOUT if something that looks like a real HTML page comes
# back.  Always releases one semaphore slot when done.
#
# NOTE(review): the first statements of this sub (up to the creation of
# $ua) were lost in the reviewed copy; they are reconstructed to mirror
# test_socks_proxy.
sub test_http_proxy {
    return if @_ == 0;
    my $proxy = shift @_;

    my $done = eval {
        my $ua = LWP::UserAgent->new();
        $ua->timeout($m_timeout);
        $ua->proxy("http", "http://" . $proxy);

        my $res = $ua->get("http://" . $m_test_site);
        if ($res->is_success()) {
            my $html = $res->as_string;
            # Guard against proxies that answer with a short error or
            # advertising page instead of the requested site.
            if (   length($html) > 1024
                && $html =~ /html/i
                && ($html =~ /table/i || $html =~ /div/i))
            {
                my $n;
                {
                    lock($m_get_num);
                    $n = ++$m_get_num;
                }
                if ($m_out_format eq "net-trans") {
                    print "$proxy\@HTTP:::$n\n";
                }
                else {
                    print "OK $proxy\@HTTP\n";
                }
            }
        }
        1;
    };
    print STDERR "test of $proxy died: $@" if !$done && $DEBUG > 2;

    $m_sem->up();
    return;
}

# test_socks_proxy("host:port")
# Thread body: asks the SOCKS proxy to connect to
# $m_test_site:$m_test_port and reports the proxy on STDOUT if that
# succeeds.  Always releases one semaphore slot when done.
sub test_socks_proxy {
    return if @_ == 0;
    my $addr = shift @_;
    my ($proxy, $port) = split /:/, $addr;

    my $done = eval {
        my $socks = IO::Socket::Socks->new(
            ProxyAddr   => $proxy,
            ProxyPort   => $port,
            ConnectAddr => $m_test_site,
            ConnectPort => $m_test_port,
            AuthType    => "none",
            Timeout     => $m_timeout,
            SocksDebug  => 0,
        );
        if (defined $socks) {
            {
                lock($m_get_num);
                $m_get_num++;
            }
            print "OK $proxy:$port\@Socks5\n";
            $socks->close();
        }
        1;
    };
    print STDERR "test of $addr died: $@" if !$done && $DEBUG > 2;

    $m_sem->up();
    return;
}

# usage([$exit_status])
# Prints the option summary and exits (status 0 unless one is given).
#
# NOTE(review): the original help text was lost in the reviewed copy.  The
# text below is rebuilt from the option table in getopts and the defaults
# at the top of the file.
sub usage {
    my ($status) = @_;
    $status = 0 if !defined $status;

    print <<'EOF';
Usage: getproxy.pl [options]
  --aim HOST             site requested through a proxy to test it
                         (default www.microsoft.com)
  --aim-port PORT        port on the test site, used by the socks5 test
                         (default 80)
  --timeout SECONDS      timeout of one proxy test (default 10)
  --threads N            number of concurrent proxy tests (default 10)
  --number N             stop after N working proxies (default 10)
  --type http|socks5     kind of proxy to look for (default http)
  --format net-trans     alternative output format
  --debug LEVEL          verbosity (default 1)
  --key WORDS            search keywords
  --engine google|baidu  search engine to query
  --search-uri URI       search URI prefix; the escaped keywords are appended
  --raw-search-uri URI   complete search URI, used as it is
  --load-urls FILE       read the pages to scan from FILE, one URL per line
  --load-proxys FILE     read the proxies to test from FILE, one host:port
                         per line
  --help, --version      show this text
EOF
    exit $status;
}

# Prints an option error on STDERR, then the usage text, and exits with
# status 1.
sub _opt_error {
    my ($message) = @_;
    print STDERR $message;
    usage(1);
    return;
}

# Parses and validates the command line into the $m_* settings and builds
# $m_raw_uri, the URI of the search request.
sub getopts {
    # NOTE(review): the declarations and the first option of this call
    # were lost in the reviewed copy.  "aim=s" is inferred from the
    # "ERROR: --aim" check below; the default engine is a guess - TODO
    # confirm.
    my $engine = "google";
    my $search_uri;
    my $help;

    GetOptions(
        "aim=s"            => \$m_test_site,
        "aim-port=s"       => \$m_test_port,
        "timeout=i"        => \$m_timeout,
        "threads=i"        => \$m_thread_num,
        "number=i"         => \$m_max_num,
        "type=s"           => \$m_proxy_type,
        "format=s"         => \$m_out_format,
        "debug=i"          => \$DEBUG,
        "key=s"            => \$m_key,
        "engine=s"         => \$engine,
        "search-uri=s"     => \$search_uri,
        "raw-search-uri=s" => \$m_raw_uri,
        "load-urls=s"      => \$m_url_file,
        "load-proxys=s"    => \$m_proxy_file,
        "help"             => \$help,
        "version"          => \$help,
    ) or usage(1);

    dump($m_test_site, $m_test_port, $m_timeout, $m_thread_num,
         $m_max_num, $m_proxy_type, $m_out_format, $DEBUG, $m_key,
         $engine, $search_uri, $m_raw_uri, $m_url_file, $m_proxy_file)
        if $DEBUG >= 4;

    usage() if $help;

    # The message has to be printed before usage() runs, because usage()
    # exits.
    _opt_error("ERROR: --number <=0\n")  if $m_max_num <= 0;
    _opt_error("ERROR: --timeout<=0\n")  if $m_timeout <= 0;
    _opt_error("ERROR: --threads<=0\n")  if $m_thread_num <= 0;
    _opt_error("ERROR: --aim\n")         if length($m_test_site) < 3;
    _opt_error("ERROR: --aim-port\n")
        if $m_test_port !~ /^\d+$/ || $m_test_port <= 0;
    _opt_error("ERROR: --format\n")
        if $m_out_format && $m_out_format ne "net-trans";
    _opt_error("ERROR: --type\n")
        if $m_proxy_type ne "http" && $m_proxy_type ne "socks5";
    _opt_error("ERROR: --engine\n")
        if $engine ne "google" && $engine ne "baidu";
    _opt_error("ERROR: --key\n")         if length($m_key) < 5;
    _opt_error("ERROR READ: --load-urls\n")
        if $m_url_file && !-r $m_url_file;
    _opt_error("ERROR READ: --load-proxys\n")
        if $m_proxy_file && !-r $m_proxy_file;

    if (!$search_uri) {
        $search_uri = $engine eq "baidu" ? $BAIDU : $GOOGLE;
    }
    if (!$m_raw_uri) {
        $m_raw_uri = $search_uri . uri_escape($m_key . " " . $m_proxy_type);
    }
    $m_raw_uri = "http://" . $m_raw_uri if $m_raw_uri !~ /^http/;
    return;
}
#EOF