import sqlite3
import urllib.error
import ssl
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup
def _insecure_ssl_context():
    """Return a default SSL context with certificate checks turned off.

    The crawler deliberately ignores SSL certificate errors so that pages
    served with broken or self-signed certificates can still be fetched.
    """
    context = ssl.create_default_context()
    # Hostname checking is switched off before the verify mode is relaxed;
    # the ssl module refuses CERT_NONE while hostname checking is enabled.
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    return context


# Ignore SSL certificate errors
ctx = _insecure_ssl_context()
# Open (or create) the crawl database and make sure its three tables exist:
#   Pages - one row per known URL, with its HTML, error code and rank values
#   Links - unique (from_id, to_id) pairs
#   Webs  - unique URL prefixes stored for the crawl
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

_SCHEMA_STATEMENTS = (
    '''CREATE TABLE IF NOT EXISTS Pages
(id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT,
error INTEGER, old_rank REAL, new_rank REAL)''',
    '''CREATE TABLE IF NOT EXISTS Links
(from_id INTEGER, to_id INTEGER, UNIQUE(from_id, to_id))''',
    '''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''',
)
for _statement in _SCHEMA_STATEMENTS:
    cur.execute(_statement)
# Check to see if we are already in progress: any page that has neither HTML
# nor an error recorded means an earlier crawl left work behind.
cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
row = cur.fetchone()
if row is None:
    # Nothing pending, so ask for a starting point (a default is used when
    # the user just presses enter).
    starturl = input('Enter web url or enter: ')
    if not starturl:
        starturl = 'http://www.dr-chuck.com/'
    if starturl.endswith('/'):
        starturl = starturl[:-1]

    # When the start URL names an .htm/.html document, the stored web prefix
    # is everything before its last slash; otherwise it is the URL itself.
    if starturl.endswith(('.htm', '.html')):
        web = starturl[:starturl.rfind('/')]
    else:
        web = starturl

    if len(web) > 1:
        cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', (web,))
        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (starturl,))
        conn.commit()
else:
    print("Restarting existing crawl. Remove spider.sqlite to start a fresh crawl.")
# Get the current webs: every URL prefix stored in the Webs table, as text.
cur.execute('''SELECT url FROM Webs''')
webs = [str(record[0]) for record in cur]
print(webs)
# Main crawl loop. Each iteration picks one pending page at random, fetches
# it, stores its HTML, and records every link that falls under one of the
# prefixes in `webs` (both as a new pending page and as a Links row).
many = 0
while True:
    # Ask for a new batch size whenever the current batch is used up;
    # an empty answer ends the crawl.
    if many < 1:
        sval = input('How many pages:')
        if len(sval) < 1:
            break
        try:
            many = int(sval)
        except ValueError:
            # A typo should re-prompt rather than kill the crawl.
            print('Please enter a whole number, or just press enter to stop')
            continue
    many = many - 1

    # Pick one page that has neither HTML nor an error recorded.
    cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
    row = cur.fetchone()
    if row is None:
        print('No unretrieved HTML pages found')
        many = 0
        break
    fromid, url = row

    print(fromid, url, end=' ')

    # If we are retrieving this page, there should be no links from it
    cur.execute('DELETE from Links WHERE from_id=?', (fromid,))
    try:
        # The with-block closes the response as soon as it has been read.
        with urlopen(url, context=ctx) as document:
            html = document.read()
            status = document.getcode()
            content_type = document.info().get_content_type()

        if status != 200:
            # NOTE(review): the status is recorded as an error but the page
            # is still processed below - confirm that this is intended.
            print("Error on page: ", status)
            cur.execute('UPDATE Pages SET error=? WHERE url=?', (status, url))

        if 'text/html' != content_type:
            print("Ignore non text/html page")
            cur.execute('DELETE FROM Pages WHERE url=?', (url,))
            conn.commit()
            continue

        print('(' + str(len(html)) + ')', end=' ')

        soup = BeautifulSoup(html, "html.parser")
    except KeyboardInterrupt:
        print('')
        print('Program interrupted by user...')
        break
    except Exception:
        # Any fetch/parse failure marks the page so it is not retried.
        print("Unable to retrieve or parse page")
        cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url,))
        conn.commit()
        continue

    cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (url,))
    cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url))
    conn.commit()

    # Retrieve all of the anchor tags
    count = 0
    for tag in soup('a'):
        href = tag.get('href', None)
        if href is None:
            continue

        try:
            # Resolve relative references like href="/contact"
            if len(urlparse(href).scheme) < 1:
                href = urljoin(url, href)
        except ValueError:
            # urllib.parse rejects some malformed hrefs; skip those links
            # instead of aborting the whole crawl.
            continue

        # Drop any #fragment, skip image links, and trim one trailing slash.
        ipos = href.find('#')
        if ipos > 1:
            href = href[:ipos]
        if href.endswith(('.png', '.jpg', '.gif')):
            continue
        if href.endswith('/'):
            href = href[:-1]
        if len(href) < 1:
            continue

        # Check if the URL is in any of the webs
        if not any(href.startswith(web) for web in webs):
            continue

        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (href,))
        count = count + 1
        conn.commit()

        cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', (href,))
        row = cur.fetchone()
        if row is None:
            print('Could not retrieve id')
            continue
        toid = row[0]
        cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', (fromid, toid))

    # The Links insert above runs after the per-link commit, so the last one
    # would still be pending (and lost at exit) without this commit.
    conn.commit()

    print(count)

cur.close()
“The amphibian!” cried Larry. “I wonder why——” “I did not see her,” Miss Serena replied to Sandy while she answered the older man’s question in the same breath. “But I saw a glimpse of dress just afterward.” Her expression showed confident assurance. “It’s time to find out what’s what!” he muttered. "De veras?" asked Cairness, sharply. He was of no mind to lose her like this, when he was so near his end. With his heart full of hope and joy, the Deacon bustled around to make every possible preparation for the journey. "We do," responded those kneeling at the altar. "Silence, Sergeant. Billings? Billings? The name of the Lieutenant-Colonel of the 200th Ind. happens to be McBiddle—one-armed man, good soldier. Billings? Billings? T. J. Billings? Is that your name?" "Pete," said Shorty solemnly as he finished trimming the switch, and replaced the knife in his pocket, "nobody's allowed to pick out his own daddy in this world. He just gits him. It's one o' the mysterious ways o' Providence. You've got me through one o' them mysterious ways o' Providence, and you can't git shet o' me. I'm goin' to lick you still harder for swearin' before your father, and sayin' disrespeckful words to him. And I'm goin' to lick you till you promise never to tech another card until I learn you you how to play, which'll be never. Come here, my son." "Leave me alone," Dodd said. "Just do me a favor. Leave me alone." "I cud m?ake something out of Boarzell." Should beauty forget now their nests have grown cold? Makes boil the rushing blood and thrills my very soul." "Because you gave those things up of your free will—they were made to give them up by force. You've no right to starve and deny other people as you have to starve and deny yourself." He rose to his feet. The kitchen was dark, with eddying sweeps of shadow in the corners which the firelight caressed—while a single star put faint ghostly romance into the window. "Oh, mother!" shrieked Margaret. "Fly!—to the abbey, and take sanctuary!" 
HoME大陆明星露点电影片段合集
ENTER NUMBET 0017
www.lete1.com.cn
www.yijianba.com.cn
sizu9.com.cn
www.fm35.com.cn
www.hxpq.com.cn
www.shina5.net.cn
laguo2.com.cn
www.yaoze1.net.cn
www.maofu7.net.cn
mentu7.com.cn