lua 以表的方式传入一个list 循环访问表中全部网址 并抓取网页内容 怎...
发布网友
发布时间:2024-01-18 11:00
我来回答
共1个回答
热心网友
时间:2024-07-28 03:50
将以下代码保存为fetchhtml.lua:
#!/usr/bin/env lua
socket = require "socket"
-- download() fetches one page over plain HTTP (port 80) and stores the raw
-- response text in the global table html_page under the key host .. file.
--   host: server name to connect to, e.g. "www.lua.org"
--   file: request path on that server, e.g. "/index.html"
-- Raises an error if the connection cannot be established.
-- Prints host, file and the number of bytes received when the transfer ends.
function download (host, file)
  local c = assert(socket.connect(host, 80))
  -- counts number of bytes read
  local count = 0
  -- Pieces are collected in a table and joined once at the end; appending to
  -- a string inside the loop would copy the whole page on every chunk.
  local chunks = {}
  -- The Host header lets servers that host several sites on one address
  -- pick the requested site instead of answering with their default page.
  c:send("GET " .. file .. " HTTP/1.0\r\nHost: " .. host .. "\r\n\r\n")
  while true do
    local s, status = receive(c)
    if s and #s > 0 then
      chunks[#chunks + 1] = s
      count = count + #s
    end
    -- A nil status means a full chunk arrived and "timeout" means receive()
    -- already yielded, so both cases simply try again. Any other status
    -- ("closed" or a socket error) ends the transfer; without this check an
    -- unexpected error string would keep the loop spinning forever.
    if status ~= nil and status ~= "timeout" then
      if status ~= "closed" then
        io.stderr:write("download ", host, file, ": ", tostring(status), "\n")
      end
      break
    end
  end
  html_page[host .. file] = table.concat(chunks)
  c:close()
  print(host, file, count)
end
-- receive() asks the remote server for up to 1 KiB of page content per call.
--   connection: an object providing settimeout() and receive() methods
-- Returns the received data (or the partial data when the read did not
-- complete) followed by the status reported by connection:receive().
-- When the read times out, the running coroutine yields the connection
-- before returning, so other download tasks get a turn.
function receive (connection)
  local chunk_size = 2^10
  -- bounded wait: a slow server must not hold up the other downloads
  connection:settimeout(0.5)
  local data, err, leftover = connection:receive(chunk_size)
  if err == "timeout" then
    coroutine.yield(connection)
  end
  if data then
    return data, err
  end
  return leftover, err
end
-- Cooperative (non-preemptive) multitasking: every download runs in its own
-- coroutine so that one pending download does not block the others.
-- threads is the list of download coroutines that have been queued.
threads = {}

-- get() queues a download of `file` from `host`; it only creates the
-- coroutine and appends it to threads, nothing is fetched yet.
function get (host, file)
  local function task ()
    download(host, file)
  end
  threads[#threads + 1] = coroutine.create(task)
end
-- dispatch() resumes the coroutines stored in the global list threads in
-- round-robin order until every one of them has finished.
-- A coroutine is dropped from the list as soon as it is dead, whether it
-- returned normally or raised an error; an error is reported on stderr.
-- Checking the coroutine status instead of the value returned by resume is
-- what keeps a failed task from staying in the list forever.
function dispatch ()
  local i = 1
  while threads[1] ~= nil do
    if threads[i] == nil then
      -- walked past the end of the list: start another round
      i = 1
    end
    local co = threads[i]
    local ok, err = coroutine.resume(co)
    if not ok then
      io.stderr:write("download task failed: ", tostring(err), "\n")
    end
    if coroutine.status(co) == "dead" then
      -- removing shifts the next task into slot i, so i is left unchanged
      table.remove(threads, i)
    else
      i = i + 1
    end
  end
end
-- Main program.
-- Open html_list.txt and build the list of host names, one per line.
local f = assert(io.open("html_list.txt", "r"))
lines = {}
for line in f:lines() do
  -- Trim surrounding whitespace (this also removes the "\r" left by CRLF
  -- line endings) and skip blank lines so they are not used as host names.
  local entry = line:match("^%s*(.-)%s*$")
  if entry ~= "" then
    lines[#lines + 1] = entry
  end
end
f:close()

-- Queue a download of /index.html for every host in the list, then run all
-- queued downloads. html_page stays global because download() writes the
-- fetched pages into it.
html_page = {}
for i, host in ipairs(lines) do
  print(i, host)
  get(host, "/index.html")
end
dispatch()

-- Print the collected results: the key (host .. file) followed by the page.
for k, v in pairs(html_page) do
  print(k)
  print(v)
end
编辑文本文件html_list.txt:
www.lua.org
www.debian.org
测试结果:
moose@debian:~/Code/baidu_knowledge/lua_fetchhtml$ ./fetchhtml.lua
1 www.lua.org
2 www.debian.org
www.debian.org /index.html 774
www.lua.org /index.html 591
www.debian.org/index.html
HTTP/1.1 200 OK
Date: Sun, 29 Mar 2015 09:00:27 GMT
Server: Apache
Last-Modified: Wed, 17 Apr 2013 12:05:34 GMT
ETag: "369d5b7f-1d6-4da8d4d9c01c7"
Accept-Ranges: bytes
Content-Length: 470
Vary: Accept-Encoding
X-Clacks-Overhead: GNU Terry Pratchett
Connection: close
Content-Type: text/html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
<HEAD>
<TITLE>Welcome to senfter!</TITLE>
</HEAD>
<BODY>
<H1>Welcome to senfter!</H1>
This is senfter, a system run by and for the <a href="http://www.debian.org/">Debian Project</a>.
She does stuff.
What kind of stuff and who our kind sponsors are you might learn on
<a href="http://db.debian.org/machines.cgi?host=senfter">db.debian.org</a>.
<P>
<HR NOSHADE />
<FONT size="-1">DSA</FONT>
</BODY>
</HTML>
www.lua.org/index.html
HTTP/1.0 302 Moved Temporarily
Connection: close
Date: Sun, 29 Mar 2015 09:00:26 GMT
Server: Cherokee
Location: http://www./index.html
Content-Type: text/plain
URI: http://www./index.html
Content-Length: 348
Content-Type: text/html
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html>
<head><title>302 Moved Temporarily</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
<h1>302 Moved Temporarily</h1>
The document has moved <a href="http://www./index.html">here</a>.
<p><hr>
Cherokee web server, Port 80
</body>
</html>
有关知识点参考《Programming in Lua》第三版中以下章节:
-- 5 Functions
-- 9.4 Non-Preemptive Multithreading
-- 22.1 The Simple I/O Model
-- 22.2 The Complete I/O Model