210821 - TIL

210821 - TIL

# 한 것

## 3개의 사이트에서 검색한 단어에 대한 job scrapper 만들기(python, flask)

job.title job.company job.link

# 배운 것

## getting the second tag of a type

secondtable = soup.findAll('table')[1]

https://stackoverflow.com/questions/14095511/beautifulsoup-in-python-getting-the-n-th-tag-of-a-type

## concatenate lists

```python
listone = [1, 2, 3]
listtwo = [4, 5, 6]
joinedlist = listone + listtwo
>>> joinedlist
[1, 2, 3, 4, 5, 6]
```

https://stackoverflow.com/questions/1720421/how-do-i-concatenate-two-lists-in-python

## flask.redirect()

Returns a response object (a WSGI application) that, if called, redirects the client to the target location.

https://flask.palletsprojects.com/en/2.0.x/api/#flask.redirect

## csv

The so-called CSV (Comma Separated Values) format is the most common import and export format for spreadsheets and databases.

https://docs.python.org/3/library/csv.html

## csv.writer()

Return a writer object responsible for converting the user’s data into delimited strings on the given file-like object. csvfile can be any object with a write() method.

https://docs.python.org/3/library/csv.html#csv.writer

## writer.writerow

import csv


def save_to_file(jobs):
    """Write scraped jobs to jobs.csv with a Title/Company/Link header row.

    Args:
        jobs: iterable of dicts; each dict's values are written as one CSV
            row in insertion order (expected keys: title, company, link).
    """
    # newline="" is required by the csv docs so the writer does not emit
    # blank rows on Windows; "with" closes the file even if writing raises.
    with open("jobs.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "Company", "Link"])
        for job in jobs:
            writer.writerow(list(job.values()))

https://docs.python.org/3/library/csv.html#csv.csvwriter.writerow

## flask.send_file()

Send the contents of a file to the client.

@app.route('/return-files/')
def return_files_tut():
    """Send a static PDF to the client; on failure return the error text."""
    try:
        return send_file(
            '/var/www/PythonProgramming/PythonProgramming/static/ohhey.pdf',
            attachment_filename='ohhey.pdf',
        )
    except Exception as e:
        return str(e)

https://flask.palletsprojects.com/en/2.0.x/api/#flask.send_file

https://pythonprogramming.net/flask-send-file-tutorial/

## python dictionary values()

values() is a built-in dictionary method in Python that returns a view object (dict_values, not a plain list) containing all the values in the dictionary.

```python
marks = {'Physics': 67, 'Maths': 87}
print(marks.values())
# Output: dict_values([67, 87])
```

https://www.geeksforgeeks.org/python-dictionary-values/

https://www.programiz.com/python-programming/methods/dictionary/values

## python list()

The list() constructor returns a list in Python.

list([iterable])

```python
text = 'Python'

# convert string to list
text_list = list(text)
print(text_list)

# check type of text_list
print(type(text_list))

# Output:
# ['P', 'y', 't', 'h', 'o', 'n']
# <class 'list'>
```

https://www.programiz.com/python-programming/methods/built-in/list

✅ 작성한 코드

# main.py
import wwr
import remoteok
import so
from flask import Flask, render_template, request, redirect, send_file
from save import save_to_file

# In-memory cache: lowercase search word -> list of job dicts.
db = {}
app = Flask("JobScrapper")


@app.route("/export")
def export():
    """Export the cached jobs for ?word=... as a downloadable CSV.

    Redirects to the home page when the word is missing or has not been
    searched yet, or when writing/sending the CSV fails.
    """
    word = request.args.get("word")
    if not word:
        return redirect("/")
    jobs = db.get(word.lower())
    if not jobs:
        # Only previously-searched words can be exported.
        return redirect("/")
    try:
        save_to_file(jobs)
        return send_file("jobs.csv", attachment_filename="jobs.csv", as_attachment=True)
    except Exception:
        return redirect("/")


def get_jobs(word):
    """Return jobs for `word`, scraping all three sites on a cache miss."""
    if db.get(word):
        return db[word]
    jobs = []
    try:
        jobs += wwr.get_jobs(word)
        jobs += so.get_jobs(word)
        jobs += remoteok.get_jobs(word)
    except Exception:
        # Best-effort: keep whatever was scraped before the failure.
        pass
    if jobs:
        # Cache only non-empty results so a transient scrape failure does
        # not pin an empty list for this word forever.
        db[word] = jobs
    return jobs


@app.route("/search")
def search_jobs():
    """Render search results for ?term=...; redirect home when it is missing."""
    word = request.args.get("term")
    if not word:
        return redirect("/")
    word = word.lower()
    jobs = get_jobs(word)
    return render_template("search.html", word=word, count=len(jobs), jobs=jobs)


@app.route("/")
def home():
    return render_template("home.html")


app.run(host="0.0.0.0")

# wwr.py (weworkremotely)
import requests
from bs4 import BeautifulSoup

# Custom User-Agent: the site may reject the default requests UA.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}


def get_jobs(word):
    """Scrape We Work Remotely search results for `word`.

    Returns a list of {"title", "company", "link"} dicts; an empty list
    when the expected page structure is not found.
    """
    url = f"https://weworkremotely.com/remote-jobs/search?term={word}"
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, "html.parser")

    container = soup.find("div", {"class": "jobs-container"})
    if container is None:
        # Layout changed or request was blocked: nothing to scrape.
        return []

    jobs = []
    for section in container.find_all("section", {"class": "jobs"}):
        for job_li in section.find("ul").find_all("li"):
            # Real job rows carry at least two anchors; filler rows such
            # as "view-all" do not, so skip anything without a second <a>.
            try:
                job_a = job_li.find_all("a", href=True)[1]
            except IndexError:
                continue
            title = job_a.find("span", {"class": "title"}).get_text()
            company = job_a.find("span", {"class": "company"}).get_text()
            link = f"https://weworkremotely.com{job_a['href']}"
            jobs.append({"title": title, "company": company, "link": link})
    return jobs


def init():
    get_jobs("python")

# so.py (stackoverflow)
# NOTE(review): Stack Overflow Jobs has since been discontinued; these URLs
# may no longer resolve.
import requests
from bs4 import BeautifulSoup

# Never scrape more than this many result pages.
MAX_PAGES = 5


def get_last_page(url):
    """Return the number of result pages to scrape, capped at MAX_PAGES."""
    result = requests.get(url)
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "s-pagination"})
    pages = pagination.find_all('a')
    # The final anchor is "next"; the one before it holds the last page number.
    last_page = int(pages[-2].get_text(strip=True))
    return min(last_page, MAX_PAGES)


def extract_job(html):
    """Turn one "-job" result div into a {"title", "company", "link"} dict."""
    fl1 = html.find("div", {"class": "fl1"})
    title = fl1.find('a')["title"]
    mb4 = fl1.find("h3", {"class": "mb4"})
    company = mb4.find('span').string
    company = company.strip() if company else "None"
    job_id = html["data-jobid"]
    return {
        'title': title,
        'company': company,
        'link': f"https://stackoverflow.com/jobs/{job_id}",
    }


def extract_jobs(last_page, url):
    """Scrape pages 1..last_page of `url` and collect every job dict."""
    jobs = []
    for page in range(last_page):
        # BUG FIX: the query parameter is "pg", not "pg;" — the semicolon
        # was an HTML-entity artifact introduced when the code was
        # republished, and produced a wrong URL for every page.
        result = requests.get(f"{url}&pg={page + 1}")
        soup = BeautifulSoup(result.text, "html.parser")
        for item in soup.find_all("div", {"class": "-job"}):
            jobs.append(extract_job(item))
    return jobs


def get_jobs(word):
    """Scrape Stack Overflow Jobs (remote-only results) for `word`."""
    # BUG FIX: "q" was likewise mangled to "q;" in the republished source.
    url = f"https://stackoverflow.com/jobs?r=true&q={word}"
    last_page = get_last_page(url)
    return extract_jobs(last_page, url)


def init():
    get_jobs("python")

# remoteok.py
import requests
from bs4 import BeautifulSoup

# Custom User-Agent: the site may reject the default requests UA.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}


def get_jobs(word):
    """Scrape Remote OK's remote-dev+`word` board.

    Returns a list of {"title", "company", "link"} dicts; an empty list
    when the jobs table is not present in the response.
    """
    url = f"https://remoteok.io/remote-dev+{word}-jobs"
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.text, "html.parser")

    board = soup.find("table", {"id": "jobsboard"})
    if board is None:
        # Layout changed or request was blocked: nothing to scrape.
        return []

    jobs = []
    for row in board.find_all("tr", {"class": "job"}):
        title = row.find("td", {"class": "position"}).find("h2", {"itemprop": "title"}).get_text()
        jobs.append({
            "title": title,
            "company": row["data-company"],
            "link": f"https://remoteok.io{row['data-href']}",
        })
    return jobs


def init():
    get_jobs("python")

In templates

Job Scrapper Job Scrapper Find remote jobs! Search by term: Find my job

Job Scrapper Job Scrapper {{count}} {{word}} jobs found. Export to CSV ← Go back {% for job in jobs %} {{job.title}} {{job.company}} Apply {% endfor %}

from http://refigo.tistory.com/28 by ccl(A) rewrite - 2021-08-22 17:26:36