MareArts Computer Vision Study.: 2019.11

11/29/2019

download zip file from url, python sample code

from bs4 import BeautifulSoup
import requests
import os
import sys

def downloadZip(url, prefix_url, outpath):
    """Download every ``.zip`` file linked from the page at *url*.

    The page is fetched and parsed, and each ``<a href=...>`` whose target
    ends in ``.zip`` is downloaded into *outpath*, named after the last path
    component of the link.  Links whose download does not answer with an OK
    status are skipped.

    Args:
        url: Address of the HTML page to scan for links.
        prefix_url: String prepended verbatim to each ``href`` to build the
            download address (e.g. ``'http://aaa.com/'`` + ``'abc.zip'``).
        outpath: Directory the files are written into; it must already exist.
    """
    mbyte = 1024 * 1024
    html = requests.get(url).text
    soup = BeautifulSoup(html, features='lxml')

    for link in soup.findAll('a', href=True):
        href = link['href']
        # Only anchors that point at a zip archive are of interest.
        if not href.endswith('.zip'):
            continue

        # Local file name: last path component of the link.
        outfname = outpath + '/' + href.split('/')[-1]
        # Download address: http://aaa.com/ + 'abc.zip'
        zipurl = prefix_url + href
        print(zipurl)

        # The context manager releases the streamed connection even when the
        # status is not OK or writing the file fails part-way.
        with requests.get(zipurl, stream=True) as r:
            if r.status_code != requests.codes.ok:
                continue

            # Content-Length is optional (e.g. chunked responses), so do not
            # let a missing header abort the whole run with a KeyError.
            length = r.headers.get('content-length')
            if length is None:
                print('Downloading %s (unknown size)' % outfname)
            else:
                print('Downloading %s (%sMb)' % (outfname, int(length) / mbyte))

            # "with" closes the file; no explicit close() is needed.
            with open(outfname, 'wb') as fd:
                for chunk in r.iter_content(chunk_size=1024):  # chunk size can be larger
                    if chunk:  # ignore keep-alive chunks
                        fd.write(chunk)

# Download target: ./data_download_pdf under the current working directory.
base_path = os.getcwd()
path_join = os.path.join(base_path, 'data_download_pdf')
# Kept from the original: also puts the directory on the module search path.
sys.path.append(path_join)

# point to output directory
outpath = path_join
# downloadZip opens files inside outpath for writing, which fails when the
# directory does not exist yet, so create it up front.
os.makedirs(outpath, exist_ok=True)
url = 'https://www.gsa.gov/real-estate/real-estate-services/leasing-policy-procedures/lease-documents/lease-documents-region-1-new-england/'
prefix_url = 'https://www.gsa.gov/cdnstatic'

downloadZip(url, prefix_url, outpath)

11/24/2019

python website url verification code

    
import requests
import urllib.request

# Check that a website answers, once with requests and once with urllib,
# printing the HTTP status code or 'fail to access' for each probe.
url = 'http://www.marearts.com'

# Probe 1: requests.
try:
    # NOTE(review): verify=False turns off TLS certificate verification;
    # confirm this is intended before pointing the check at an https URL.
    # The timeout keeps an unresponsive host from hanging the script forever.
    resp = requests.get(url, verify=False, timeout=10)
    print(url, resp.status_code)
except Exception:
    # Still best-effort, but unlike a bare "except:" this no longer swallows
    # KeyboardInterrupt / SystemExit.
    print('fail to access')

# Probe 2: urllib from the standard library.
try:
    # The context manager closes the connection once the status is read.
    with urllib.request.urlopen(url, timeout=10) as resp:
        print(url, resp.getcode())
except Exception:
    # Same best-effort handling as the first probe.
    print('fail to access')

11/20/2019

All configured authentication methods failed -> vscode sftp setting

Try using this configuration in your sftp.json file.

{
    "name": "test",
    "protocol": "sftp",
    "host": "ec2-114-24-120-84.eu-west-4.compute.amazonaws.com",
    "remotePath": "/home/ubuntu/ABCD",
    "privateKeyPath": "/Users/ABD/ec2_aws.pem",
    "username": "ubuntu",
    "port": 22,
    "secure": true,
    "uploadOnSave": true,
    "passive": false,
    "debug": true,
    "ignore": [
        "\\.vscode",
        "\\.git",
        "\\.DS_Store"
    ],
    "generatedFiles": {
        "uploadOnSave": true,
        "extensionsToInclude": [],
        "path": "./"
    }
}

11/16/2019

yum install -y https://centos7.iuscommunity.org/ius-release.rpm -> yum install -y https://centos7.iuscommunity.org/ius-release.rpm

yum install -y https://centos7.iuscommunity.org/ius-release.rpm

If you get this error:

Error:
Problem: conflicting requests
- nothing provides epel-release = 7 needed by ius-release-2-1.el7.ius.noarch
(try to add '--skip-broken' to skip uninstallable packages or '--nobest' to use not only best candidate packages)

Try to install this one first.

yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm

then

yum install -y https://centos7.iuscommunity.org/ius-release.rpm

Thank you.

yum install unxz -> Error: Nothing to do

use this one:

yum install xz

yum install wkhtmltopdf -> No package wkhtmltopdf available.

# Manual install of wkhtmltopdf for when yum has no package for it.
# Fetch the prebuilt wkhtmltox 0.12.4 generic amd64 archive from GitHub releases.
wget https://github.com/wkhtmltopdf/wkhtmltopdf/releases/download/0.12.4/wkhtmltox-0.12.4_linux-generic-amd64.tar.xz
# Strip the .xz compression, leaving a plain .tar file.
unxz wkhtmltox-0.12.4_linux-generic-amd64.tar.xz
# Unpack the tar archive (the next lines expect it to produce a wkhtmltox/ directory).
tar -xvf wkhtmltox-0.12.4_linux-generic-amd64.tar
# Move the unpacked binaries into /usr/local/bin.
mv wkhtmltox/bin/* /usr/local/bin/
# Clean up the unpacked directory and the tar file.
rm -rf wkhtmltox
rm -f wkhtmltox-0.12.4_linux-generic-amd64.tar

reference :
https://github.com/JazzCore/python-pdfkit
https://gist.github.com/paulsturgess/cfe1a59c7c03f1504c879d45787699f5
https://gist.github.com/AndreasFurster/ebe3f163d6d47be43b72b35b18d8b5b6

11/15/2019

rect intersector check using image (python)

1. Create an image as a NumPy array filled with zeros.
2. For each box, add 1 to every element inside the box.
3. Check whether any element has a value greater than 1; if so, at least two boxes overlap.

Thank you.

# Detect overlapping rectangles by rasterising them into a counter image.
# Reads height, width and box_list from the surrounding script; each entry of
# box_list carries a 'bbox' whose first four values are used as
# left, top, right, bottom.

# Counter image: one cell per pixel, all starting at zero.
img = np.zeros((int(height), int(width), 1), dtype="uint8")

# Add 1 to every pixel covered by each box.
# NOTE(review): a uint8 counter wraps around after 255 increments of the same
# pixel; widen the dtype if that many boxes can stack on one spot.
for v in box_list:
    left = int(v['bbox'][0])
    top = int(v['bbox'][1])
    right = int(v['bbox'][2])
    bottom = int(v['bbox'][3])
    img[top:bottom, left:right] += 1

#check overlap
# reshape(-1) flattens without recomputing height*width, which is not a valid
# shape when the dimensions are not ints (hence the int() casts above).
imgv = img.reshape(-1)
# A pixel counted more than once is covered by at least two boxes.
if np.any(imgv > 1):
    print('intersector found')

11/13/2019

PDF split page and save each page to pdf, python

#manual https://pymupdf.readthedocs.io/en/latest/

#pip3 install PyMuPDF

import fitz

# Split ./test2.pdf into one PDF per page, saved as 0.pdf, 1.pdf, ...
doc = fitz.open('./test2.pdf')

#page_number = doc.pageCount
#print(page_number)

#split pages
for i, page in enumerate(doc.pages()):
    print(i)
    # Start from an empty document for this page.
    doc2 = fitz.open()
    # Bound the copy on both ends: with only to_page given, the range starts
    # at the first page, so file i would contain pages 0..i instead of page i.
    doc2.insertPDF(doc, from_page=i, to_page=i)
    doc2.save("{}.pdf".format(i))
    # Release the per-page document before moving on to the next one.
    doc2.close()

Pages