from bs4 import BeautifulSoup
import requests
import os
import sys
def downloadZip(url, prefix_url, outpath):
    """Download every ``.zip`` file linked from *url* into *outpath*.

    Parameters
    ----------
    url : str
        Page whose anchor (``<a href=...>``) tags are scanned for links
        ending in ``.zip``.
    prefix_url : str
        Base URL prepended to each (relative) zip link found on the page.
    outpath : str
        Existing directory where the downloaded archives are written.
    """
    mbyte = 1024 * 1024
    html = requests.get(url).text
    soup = BeautifulSoup(html, features='lxml')
    for name in soup.findAll('a', href=True):
        zipurl = name['href']
        # Only follow links to zip archives.
        if not zipurl.endswith('.zip'):
            continue
        # Local filename = last path component of the link.
        outfname = os.path.join(outpath, zipurl.split('/')[-1])
        # Make absolute url:  http://aaa.com + /abc.zip
        zipurl = prefix_url + zipurl
        print(zipurl)
        # Stream the body so large archives are not buffered in memory;
        # the `with` block ensures the connection is released.
        with requests.get(zipurl, stream=True) as r:
            if r.status_code != requests.codes.ok:
                continue
            # content-length may be absent (chunked transfer) — default 0
            # instead of raising KeyError as the original did.
            fsize = int(r.headers.get('content-length', 0))
            print('Downloading %s (%sMb)' % (outfname, fsize / mbyte))
            with open(outfname, 'wb') as fd:
                # 1 MiB chunks (the original's 1 KiB was needlessly small).
                for chunk in r.iter_content(chunk_size=mbyte):
                    if chunk:  # ignore keep-alive chunks
                        fd.write(chunk)
                # NOTE: no fd.close() here — the `with` statement closes it;
                # the original's explicit close inside the block was redundant.
# --- Script entry: fetch all zip archives from the GSA Region-1 lease page ---
base_path = os.getcwd()
path_join = os.path.join(base_path, 'data_download_pdf')
# The download directory must exist before downloadZip() opens files in it.
# (The original appended this *data* directory to sys.path, which does
# nothing useful — it is a module search path, not a filesystem helper.)
os.makedirs(path_join, exist_ok=True)
# point to output directory
outpath = path_join
url = 'https://www.gsa.gov/real-estate/real-estate-services/leasing-policy-procedures/lease-documents/lease-documents-region-1-new-england/'
prefix_url = 'https://www.gsa.gov/cdnstatic'
downloadZip(url, prefix_url, outpath)

 
 
 
 
# (Blogger page residue — "No comments: / Post a Comment" — commented out
#  so the file remains valid Python.)