PHP 获取网页内容的几个函数
内容摘要
这篇文章主要为大家详细介绍了PHP 获取网页内容的几个函数,具有一定的参考价值,可以用来参考一下。
稍微有点意义的函数是:get_content_by_socket(), get_url(), get_content_url(), get_content_object 几个函数。
文章正文
这篇文章主要为大家详细介绍了PHP 获取网页内容的几个函数,具有一定的参考价值,可以用来参考一下。
稍微有点意义的函数是:get_content_by_socket(), get_url(), get_content_url(), get_content_object 几个函数,也许能够给你点什么想法。经测试代码如下:
<code class="php">
<?php
/**
 * Crawl the numbered index pages "<prefix><i>.htm" and append every
 * download link found on them to $save_file.
 *
 * @param string $save_file Output file; truncated before writing.
 * @param string $prefix    URL prefix of the index pages. It must form an
 *                          absolute http(s) URL, otherwise get_url() aborts.
 * @param int    $count     Exclusive upper bound of the page counter
 *                          (pages 1 .. $count - 1 are fetched).
 * @return void
 */
function get_index($save_file, $prefix = "index_", $count = 68)
{
    // Mode "w" truncates an existing file, replacing the unlink + "a+" pair.
    $fp = fopen($save_file, "w") or die("Open " . $save_file . " failed");

    for ($i = 1; $i < $count; ++$i) {
        $url = $prefix . $i . ".htm";
        echo "Get " . $url . "...";

        $html = get_url($url);
        // get_content_url() needs the base the relative links are resolved
        // against; use the directory part of the page URL.
        $base_url = substr($url, 0, strrpos($url, "/") + 1);
        $url_str  = get_content_url($base_url, $html);

        echo " OK\n";
        fwrite($fp, $url_str);
    }

    fclose($fp);
}

/**
 * Fetch every URL listed in $url_file (one per line) and write the
 * multimedia object found on each page to $save_file.
 *
 * @param string $url_file  File containing one URL per line.
 * @param string $save_file Output file; truncated before writing.
 * @param string $split     Separator placed between link and title.
 * @return void
 */
function get_object($url_file, $save_file, $split = "|--:**:--|")
{
    if (!file_exists($url_file)) {
        die($url_file . " not exist");
    }

    $file_arr = file($url_file);
    if (!is_array($file_arr) || empty($file_arr)) {
        die($url_file . " not content");
    }

    // file() keeps the line terminators; strip them so the URLs validate
    // and duplicates that differ only in trailing whitespace collapse.
    $url_arr = array_unique(array_map('trim', $file_arr));

    $fp = fopen($save_file, "w") or die("Open save file " . $save_file . " failed");

    foreach ($url_arr as $url) {
        if ($url === '') {
            continue;
        }
        echo "Get " . $url . "...";
        $html_str = get_url($url);
        $obj_str  = get_content_object($html_str, $split);
        echo " OK\n";
        fwrite($fp, $obj_str);
    }

    fclose($fp);
}

/**
 * Walk the files of a directory (non-recursive) and write the multimedia
 * object found in each file to $save_file.
 *
 * @param string $save_file Output file; truncated before writing.
 * @param string $dir       Directory to scan, with or without trailing slash.
 * @return void
 */
function get_dir($save_file, $dir)
{
    $dp = opendir($dir) or die("Open dir " . $dir . " failed");
    $fp = fopen($save_file, "w") or die("Open save file " . $save_file . " failed");

    $base = rtrim($dir, "/\\") . DIRECTORY_SEPARATOR;

    // Strict comparison: an entry named "0" must not end the loop.
    while (($file = readdir($dp)) !== false) {
        if ($file == "." || $file == "..") {
            continue;
        }
        $path = $base . $file;
        if (!is_file($path)) {
            continue; // skip sub-directories and other non-regular entries
        }

        echo "Read file " . $file . "...";
        $file_content = file_get_contents($path);
        $obj_str      = get_content_object($file_content);
        echo " OK\n";
        fwrite($fp, $obj_str);
    }

    closedir($dp);
    fclose($fp);
}

/**
 * Download the content of an absolute http:// or https:// URL.
 * Terminates the script (die) on an invalid URL, a failed open, or an
 * empty response.
 *
 * @param string $url Absolute URL.
 * @return string Response body.
 */
function get_url($url)
{
    $reg = '#^https?://[^/].+$#';
    if (!preg_match($reg, $url)) {
        die($url . " invalid");
    }

    $fp = fopen($url, "r") or die("Open url: " . $url . " failed.");
    // Reads until EOF; unlike a `while ($chunk = fread())` loop it does not
    // stop early on a chunk that happens to be the string "0".
    $content = stream_get_contents($fp);
    fclose($fp);

    if ($content === false || $content === '') {
        die("Get url: " . $url . " content failed.");
    }
    return $content;
}

/**
 * Fetch a page over a raw socket on port 80.
 *
 * NOTE: the raw HTTP response is returned, i.e. status line and headers
 * are included, and the body is not de-chunked.
 *
 * @param string $url  Path on the host (a leading "/" is optional).
 * @param string $host Host name to connect to.
 * @return string Raw HTTP response.
 */
function get_content_by_socket($url, $host)
{
    $fp = fsockopen($host, 80) or die("Open " . $url . " failed");

    $header  = "GET /" . ltrim($url, "/") . " HTTP/1.1\r\n";
    $header .= "Accept: */*\r\n";
    $header .= "Accept-Language: zh-cn\r\n";
    // No "Accept-Encoding: gzip, deflate": nothing here decompresses the reply.
    $header .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; InfoPath.1; .NET CLR 2.0.50727)\r\n";
    $header .= "Host: " . $host . "\r\n";
    //$header .= "Cookie: cnzz02=2; rtime=1; ltime=1148456424859; cnzz_eid=56601755-\r\n";
    // Exactly one Connection header; "Close" makes the server end the
    // stream so reading until EOF terminates.
    $header .= "Connection: Close\r\n\r\n";

    fwrite($fp, $header);
    $contents = stream_get_contents($fp);
    fclose($fp);

    return $contents === false ? "" : $contents;
}

/**
 * Collect the href targets in $file_contents that look like
 * "down....html" and return them prefixed with $host_url, one per line.
 *
 * @param string $host_url      Prefix prepended to every matching link.
 * @param string $file_contents HTML to scan.
 * @return string Newline-terminated list of URLs ("" when none match).
 */
function get_content_url($host_url, $file_contents)
{
    // Group 1 is the href value, with or without surrounding quotes.
    $rex = '/href\s*=\s*[\'"]*([^>\'"\s]+)[\'">]*\s*/i';
    $reg = '/^(down.*?\.html)$/i';

    preg_match_all($rex, $file_contents, $r);

    $result = "";
    foreach ($r[1] as $link) {
        if (preg_match($reg, $link)) {
            $result .= $host_url . $link . "\n";
        }
    }
    return $result;
}

/**
 * Extract the first "href ... <b>title</b>" pair from $str.
 *
 * @param string $str   HTML to scan.
 * @param string $split Separator placed between link and title.
 * @return string "<link><split><title>\n", or "" when nothing matches.
 */
function get_content_object($str, $split = "|--:**:--|")
{
    // "#" delimiters so the "/" in "</b>" needs no escaping.
    $regx = '#href\s*=\s*[\'"]*([^>\'"\s]+)[\'">]*\s*(<b>.*?</b>)#i';

    if (!preg_match($regx, (string) $str, $m)) {
        return "";
    }

    $title = str_replace(array("<b>多媒体: ", "</b>"), "", $m[2]);
    return $m[1] . $split . $title . "\n";
}
?>
/*** 代码来自php教程(www.idcnote.com) ***/
</code>
注:关于PHP 获取网页内容的几个函数的内容就先介绍到这里,更多相关文章可以继续留意本站。
代码注释