11/29/2019

download zip file from url, python sample code




from bs4 import BeautifulSoup
import requests
import os
import sys

def downloadZip(url, prefix_url, outpath):
    """Download every ``.zip`` file linked from the page at *url*.

    The page is fetched and parsed with BeautifulSoup. For each ``<a>`` tag
    whose ``href`` ends in ``.zip``, the file is requested from
    ``prefix_url + href`` and streamed to disk inside *outpath*, named after
    the last path component of the ``href``. Links that do not answer with
    HTTP 200 are skipped silently.

    Args:
        url: Address of the HTML page that lists the zip files.
        prefix_url: String prepended to each ``href`` to form the download
            URL (e.g. ``'http://aaa.com/'`` + ``'abc.zip'``).
        outpath: Directory that receives the downloaded files; it is created
            if it does not exist yet.
    """
    mbyte = 1024 * 1024
    chunk_size = 64 * 1024  # larger chunks mean far fewer write() calls

    html = requests.get(url).text
    soup = BeautifulSoup(html, features='lxml')

    # open() does not create missing directories, so make sure the target
    # folder exists before the first file is written.
    os.makedirs(outpath, exist_ok=True)

    # find A tags
    for name in soup.findAll('a', href=True):
        zipurl = name['href']
        # find file extension
        if not zipurl.endswith('.zip'):
            continue

        # make download path
        outfname = outpath + '/' + zipurl.split('/')[-1]
        # make url
        zipurl = prefix_url + zipurl  # http://aaa.com/ + 'abc.zip'
        print(zipurl)

        # The context manager releases the streamed connection even when the
        # response is skipped or writing fails.
        with requests.get(zipurl, stream=True) as r:
            if r.status_code != requests.codes.ok:
                continue

            # Content-Length is optional (e.g. chunked transfer encoding), so
            # a missing header must not abort the download.
            content_length = r.headers.get('content-length')
            if content_length is not None:
                fsize = int(content_length)
                print('Downloading %s (%sMb)' % (outfname, fsize / mbyte))
            else:
                print('Downloading %s (size unknown)' % outfname)

            with open(outfname, 'wb') as fd:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:  # ignore keep-alive chunks
                        fd.write(chunk)

# Files are downloaded into ./data_download_pdf below the current working
# directory.
base_path = os.getcwd()
path_join = os.path.join(base_path, 'data_download_pdf')
# NOTE(review): this puts the download folder on the module search path; it
# looks unnecessary for downloading - confirm before removing.
sys.path.append(path_join)

# point to output directory
outpath = path_join
# The folder does not exist on a first run and open() will not create it, so
# create it here before any file is written.
os.makedirs(outpath, exist_ok=True)

url = 'https://www.gsa.gov/real-estate/real-estate-services/leasing-policy-procedures/lease-documents/lease-documents-region-1-new-england/'
prefix_url = 'https://www.gsa.gov/cdnstatic'

downloadZip(url, prefix_url, outpath)








11/24/2019

python website url verification code


import requests
import urllib.request

url = 'http://www.marearts.com'

# Seconds to wait for the server; without a timeout an unresponsive host
# would block the check indefinitely.
timeout_sec = 10

# Check 1: via requests. verify=False skips TLS certificate validation.
try:
    resp = requests.get(url, verify=False, timeout=timeout_sec)
    print(url, resp.status_code)
except Exception as err:
    # "except Exception" (not a bare "except:") reports the failure while
    # still letting Ctrl-C / SystemExit stop the script.
    print('fail to access', err)

# Check 2: via the standard library. urlopen raises on HTTP error statuses,
# so those also end up in the failure branch.
try:
    resp = urllib.request.urlopen(url, timeout=timeout_sec)
    print(url, resp.getcode())
except Exception as err:
    print('fail to access', err)

11/20/2019

All configured authentication methods failed -> vscode sftp setting

Try using the following configuration in your sftp.json file.

{
"name": "test",
"protocol": "sftp",
"host": "ec2-114-24-120-84.eu-west-4.compute.amazonaws.com",
"remotePath": "/home/ubuntu/ABCD",
"privateKeyPath": "/Users/ABD/ec2_aws.pem",
"username": "ubuntu",
"port": 22,
"secure": true,
"uploadOnSave": true,
"passive": false,
"debug": true,
"ignore": [
"\\.vscode",
"\\.git",
"\\.DS_Store"
],
"generatedFiles": {
"uploadOnSave": true,
"extensionsToInclude": [],
"path": "./"
}
}

11/16/2019

yum install -y https://centos7.iuscommunity.org/ius-release.rpm -> yum install -y https://centos7.iuscommunity.org/ius-release.rpm

yum install -y https://centos7.iuscommunity.org/ius-release.rpm
If you get this error:

Error:
 Problem: conflicting requests
  - nothing provides epel-release = 7 needed by ius-release-2-1.el7.ius.noarch
(try to add '--skip-broken' to skip uninstallable packages or '--nobest' to use not only best candidate packages)


Try to install this one first.

yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
then
yum install -y https://centos7.iuscommunity.org/ius-release.rpm


Thank you.

yum install unxz -> Error: Nothing to do

use this one:

yum install xz

yum install wkhtmltopdf -> No package wkhtmltopdf available.


# Install wkhtmltopdf from the upstream generic Linux build, since yum has no
# package for it.
WK_VERSION=0.12.4
WK_TAR="wkhtmltox-${WK_VERSION}_linux-generic-amd64.tar"

# Fetch the release archive, then decompress it (.tar.xz -> .tar) and unpack it.
wget "https://github.com/wkhtmltopdf/wkhtmltopdf/releases/download/${WK_VERSION}/${WK_TAR}.xz"
unxz "${WK_TAR}.xz"
tar -xvf "${WK_TAR}"

# Put the binaries on the PATH, then remove the unpacked tree and the archive.
mv wkhtmltox/bin/* /usr/local/bin/
rm -rf wkhtmltox
rm -f "${WK_TAR}"



reference :
https://github.com/JazzCore/python-pdfkit
https://gist.github.com/paulsturgess/cfe1a59c7c03f1504c879d45787699f5
https://gist.github.com/AndreasFurster/ebe3f163d6d47be43b72b35b18d8b5b6

11/15/2019

rect intersector check using image (python)



1. Create an image as a numpy array filled with zeros.
2. For each box, add 1 to every element inside the box.
3. Check whether any element has a value greater than 1.

Thank you.


# Build a counter image: after the loop every pixel holds the number of boxes
# that cover it. np.zeros already initialises it to 0, so no extra fill is
# needed. int32 is used instead of uint8 so the counter cannot wrap around
# when more than 255 boxes cover the same pixel.
img = np.zeros((int(height), int(width), 1), dtype="int32")
for v in box_list:
    # bbox is read as [left, top, right, bottom]; values are truncated to
    # integer pixel indices.
    left, top, right, bottom = (int(v['bbox'][k]) for k in range(4))
    img[top:bottom, left:right] += 1

#check overlap
# reshape(-1) flattens without recomputing height*width, which fails when
# height/width are not plain ints.
imgv = img.reshape(-1)
# A pixel counted more than once is covered by at least two boxes.
if (imgv > 1).any():
    print('intersector found')

11/13/2019

PDF split page and save each page to pdf, python

#pip3 install PyMuPDF
import fitz
# Open the source document whose pages are written out one file per page.
doc = fitz.open('./test2.pdf')

#page_number = doc.pageCount
#print(page_number)

#split pages
for i, page in enumerate(doc.pages()):
    print(i)
    doc2 = fitz.open()  # new, empty PDF
    # Copy exactly page i. With only to_page given, the copied range starts
    # at the first page, so every output would contain pages 0..i.
    # NOTE(review): newer PyMuPDF releases spell this insert_pdf - confirm
    # against the installed version.
    doc2.insertPDF(doc, from_page=i, to_page=i)
    doc2.save("{}.pdf".format(i))
    doc2.close()

doc.close()