11/29/2019

Download ZIP files linked from a URL (Python sample code)




from bs4 import BeautifulSoup
import requests
import os
import sys

def downloadZip(url, prefix_url, outpath, timeout=60):
    """Download every ``.zip`` file linked from the page at *url* into *outpath*.

    The page is fetched and parsed, and each ``<a href=...>`` whose target
    ends in ``.zip`` is streamed to disk under its original file name.

    Args:
        url: Address of the HTML page that lists the zip files.
        prefix_url: String prepended to each relative href to build the
            download URL (plain concatenation, e.g.
            ``'http://aaa.com/' + 'abc.zip'``). Hrefs that are already
            absolute ``http(s)://`` URLs are used unchanged.
        outpath: Directory the files are written to; created if missing.
        timeout: Seconds to wait for the server to connect/send data before
            ``requests`` gives up, so a stalled server cannot hang the script.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
        OSError: If the output directory or a file cannot be written.
    """
    mbyte = 1024 * 1024
    chunk_size = 64 * 1024  # bytes per streamed read/write

    html = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(html, features='lxml')

    # open() below fails if the target directory does not exist yet.
    if outpath:
        os.makedirs(outpath, exist_ok=True)

    # Only anchors that actually carry an href attribute.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        # Keep only links to zip archives (extension match, any letter case).
        if not href.lower().endswith('.zip'):
            continue

        # Local file name is the last path segment of the link.
        outfname = os.path.join(outpath, href.rsplit('/', 1)[-1])

        # Relative links need the prefix; absolute ones must not get it.
        if href.startswith(('http://', 'https://')):
            zipurl = href
        else:
            zipurl = prefix_url + href  # http://aaa.com/ + 'abc.zip'
        print(zipurl)

        # The context manager releases the streamed connection even when the
        # body is skipped or writing fails.
        with requests.get(zipurl, stream=True, timeout=timeout) as r:
            if r.status_code != requests.codes.ok:
                continue  # skip links that do not resolve

            # Content-Length is optional (e.g. chunked transfer encoding).
            content_length = r.headers.get('content-length')
            if content_length is not None:
                fsize = int(content_length)
                print('Downloading %s (%sMb)' % (outfname, fsize / mbyte))
            else:
                print('Downloading %s (size unknown)' % outfname)

            with open(outfname, 'wb') as fd:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:  # ignore keep-alive chunks
                        fd.write(chunk)

# Page listing the documents, and the prefix that downloadZip prepends to
# each zip link found on it.
url = 'https://www.gsa.gov/real-estate/real-estate-services/leasing-policy-procedures/lease-documents/lease-documents-region-1-new-england/'
prefix_url = 'https://www.gsa.gov/cdnstatic'

# Output directory: "data_download_pdf" under the current working directory.
base_path = os.getcwd()
path_join = os.path.join(base_path, 'data_download_pdf')
outpath = path_join
sys.path.append(outpath)

downloadZip(url=url, prefix_url=prefix_url, outpath=outpath)








11/24/2019

Python code to verify that a website URL is reachable


import requests
import urllib.request

# Probe one URL twice -- once with requests, once with urllib -- and print the
# HTTP status code, or 'fail to access' when the request cannot be completed.
url = 'http://www.marearts.com'

# --- requests ---
# NOTE(review): verify=False disables TLS certificate verification (kept from
# the original); drop it unless the target is known to have a broken cert.
try:
    resp = requests.get(url, verify=False, timeout=10)
except requests.exceptions.RequestException:
    # Base class of the errors requests raises (connection, timeout, bad URL);
    # unlike a bare except it lets Ctrl-C / SystemExit propagate.
    print('fail to access')
else:
    print(url, resp.status_code)

# --- urllib ---
# urlopen raises HTTPError for 4xx/5xx answers, so those also report failure.
try:
    # The context manager closes the connection once the status has been read.
    with urllib.request.urlopen(url, timeout=10) as resp:
        status = resp.getcode()
except Exception:
    # urlopen can fail with URLError/OSError, ValueError (malformed URL) or
    # http.client errors; this is a best-effort probe, so report them all
    # while still letting KeyboardInterrupt/SystemExit through.
    print('fail to access')
else:
    print(url, status)