Pilot to fetch strikes
s-nt-s committed Jan 27, 2021
1 parent f93ec95 commit 6d006a4
Showing 2 changed files with 332 additions and 27 deletions.
229 changes: 229 additions & 0 deletions core/web.py
@@ -0,0 +1,229 @@
import re
import time  # needed by FF.reintentar below
from urllib.parse import urljoin

import bs4
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

re_sp = re.compile(r"\s+")

default_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0',
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Expires": "Thu, 01 Jan 1970 00:00:00 GMT",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}


def buildSoup(root, source):
    # Parse HTML and rewrite relative href/src/action attributes as absolute URLs.
    soup = bs4.BeautifulSoup(source, "lxml")
    for n in soup.findAll(["img", "form", "a", "iframe", "frame", "link", "script"]):
        attr = "href" if n.name in ("a", "link") else "src"
        if n.name == "form":
            attr = "action"
        val = n.attrs.get(attr)
        if val and not (val.startswith("#") or val.startswith("javascript:")):
            val = urljoin(root, val)
            n.attrs[attr] = val
    return soup
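
For illustration, a minimal sketch of what buildSoup does; the HTML snippet and base URL below are invented:

# Hypothetical input: relative URLs are resolved against the given base.
html = '<a href="/user/foo">foo</a><img src="img/logo.png">'
soup = buildSoup("https://www.meneame.net/story/123", html)
print(soup.a["href"])   # https://www.meneame.net/user/foo
print(soup.img["src"])  # https://www.meneame.net/story/img/logo.png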


class Web:
    def __init__(self, refer=None, verify=True):
        self.s = requests.Session()
        self.s.headers = default_headers
        self.response = None
        self.soup = None
        self.form = None
        self.refer = refer
        self.verify = verify

    def get(self, url, **kargv):
        # POSTs if form data is given, GETs otherwise; the previous URL is sent as referer.
        if self.refer:
            self.s.headers.update({'referer': self.refer})
        if kargv:
            self.response = self.s.post(url, data=kargv, verify=self.verify)
        else:
            self.response = self.s.get(url, verify=self.verify)
        self.refer = self.response.url
        self.soup = buildSoup(url, self.response.content)
        return self.soup

    def prepare_submit(self, slc, silent_in_fail=False, **kargv):
        # Collect the form's input/select values, overridden by any kwargs given.
        data = {}
        self.form = self.soup.select_one(slc)
        if silent_in_fail and self.form is None:
            return None, None
        for i in self.form.select("input[name]"):
            name = i.attrs["name"]
            data[name] = i.attrs.get("value")
        for i in self.form.select("select[name]"):
            name = i.attrs["name"]
            slc = i.select_one("option[selected]")
            slc = slc.attrs.get("value") if slc else None
            data[name] = slc
        data = {**data, **kargv}
        action = self.form.attrs.get("action")
        action = action.rstrip() if action else None
        if action is None:
            action = self.response.url
        return action, data

    def submit(self, slc, silent_in_fail=False, **kargv):
        action, data = self.prepare_submit(
            slc, silent_in_fail=silent_in_fail, **kargv)
        if silent_in_fail and not action:
            return None
        return self.get(action, **data)

    def val(self, slc):
        n = self.soup.select_one(slc)
        if n is None:
            return None
        v = n.attrs.get("value", n.get_text())
        v = v.strip()
        return v if v else None

    def resolve(self, url, **kargv):
        # Follow a single hop: return the Location header if the URL redirects.
        if self.refer:
            self.s.headers.update({'referer': self.refer})
        if kargv:
            r = self.s.post(url, data=kargv, verify=self.verify, allow_redirects=False)
        else:
            r = self.s.get(url, verify=self.verify, allow_redirects=False)
        if r.status_code in (302, 301):
            return r.headers['location']
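
This is the flow debug/get_strikes.py relies on below: fetch the login page, then submit the form picked by CSS selector, with extra keyword arguments merged over the form's own fields (the credentials here are placeholders):

w = Web()
w.get("https://www.meneame.net/login")
# Returns None instead of raising if the form is missing.
page = w.submit("#login-form", username="user", password="secret", silent_in_fail=True)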

default_profile = {
    "browser.tabs.drawInTitlebar": True,
    "browser.uidensity": 1,
}


class FF:
    def __init__(self, visible=False, wait=60):
        self._driver = None
        self.visible = visible
        self._wait = wait

    @property
    def driver(self):
        # Lazily start a Firefox (headless by default) the first time it is needed.
        if self._driver is None:
            options = Options()
            options.headless = not self.visible
            profile = webdriver.FirefoxProfile()
            for k, v in default_profile.items():
                profile.set_preference(k, v)
                profile.DEFAULT_PREFERENCES['frozen'][k] = v
            profile.update_preferences()
            self._driver = webdriver.Firefox(
                options=options, firefox_profile=profile)
            self._driver.maximize_window()
            self._driver.implicitly_wait(5)
        return self._driver

    def close(self):
        if self._driver:
            self._driver.quit()  # quit() rather than close() so geckodriver is torn down too
            self._driver = None

    def reintentar(self, intentos, sleep=1):
        # Backoff helper: returns (keep_trying, next_sleep). Gives up after 50
        # attempts; every 3rd attempt the browser is restarted and the pause
        # shrinks, otherwise the pause doubles. It also sleeps here, for longer
        # the more attempts have been made.
        if intentos > 50:
            return False, sleep
        if intentos % 3 == 0:
            sleep = int(sleep / 3)
            self.close()
        else:
            sleep = sleep*2
        if intentos > 20:
            time.sleep(10)
        time.sleep(2 * (int(intentos/10)+1))
        return True, sleep
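
No caller of reintentar appears in this commit; under its contract above, a plausible retry loop would look like this sketch (the target URL and the broad except are assumptions):

ff = FF()
intentos, sleep = 0, 1
while True:
    try:
        ff.get("https://www.meneame.net")
        break
    except Exception:
        intentos += 1
        seguir, sleep = ff.reintentar(intentos, sleep)
        if not seguir:
            raise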

    def get(self, url):
        self._soup = None
        self.driver.get(url)

    def get_soup(self):
        if self._driver is None:
            return None
        return buildSoup(self._driver.current_url, self._driver.page_source)

    @property
    def source(self):
        if self._driver is None:
            return None
        return self._driver.page_source

    def wait(self, id, seconds=None):
        # Wait for an element by id, or by XPath if the locator starts with "//".
        my_by = By.ID
        seconds = seconds or self._wait
        if id.startswith("//"):
            my_by = By.XPATH
        wait = WebDriverWait(self._driver, seconds)
        wait.until(ec.visibility_of_element_located((my_by, id)))

    def val(self, id, val=None):
        if self._driver is None:
            return None
        self.wait(id)
        n = self._driver.find_element_by_id(id)
        if val is not None:
            n.send_keys(val)
        return n.text

    def click(self, id):
        if self._driver is None:
            return None
        self.wait(id)
        n = self._driver.find_element_by_id(id)
        n.click()

    def get_session(self):
        if self._driver is None:
            return None
        return self.pass_cookies()

    def pass_cookies(self, session=None):
        # Copy the browser's cookies into a requests.Session.
        if session is None:
            session = requests.Session()
        for cookie in self._driver.get_cookies():
            session.cookies.set(cookie['name'], cookie['value'])
        return session


def get_session(url):
    # Visit the URL with headless Firefox, then continue with a cookie-carrying Web scraper.
    f = FF()
    f.get(url)
    w = Web()
    w.s = f.get_session()
    w.s.headers = default_headers  # the fresh session must keep the scraper's headers
    w.get(url)
    f.close()
    return w
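
A sketch of the intended use, as far as this commit shows it (the URLs are illustrative): bootstrap cookies in the browser, then scrape with plain requests.

w = get_session("https://www.meneame.net")
soup = w.get("https://www.meneame.net/queue")  # carries the browser's cookies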
130 changes: 103 additions & 27 deletions debug/get_strikes.py
@@ -1,53 +1,129 @@
+from core.util import PrintFile
 import os
+from os.path import isdir, isfile, abspath, dirname, getsize
+from os import chdir
 import sys
 import gzip
 import bs4
+from bs4 import BeautifulSoup
 from bunch import Bunch
 from glob import glob
 import re
+from core.web import Web
+from getpass import getpass
+import time

 if len(sys.argv) == 1:
     sys.exit("You must pass the directory that holds the YYYY/ID-PAGE.html.gz files")
 ROOT = sys.argv[1]
-if not os.path.isdir(ROOT):
+if not isdir(ROOT):
     sys.exit(ROOT + " does not exist")
-ROOT = os.path.abspath(ROOT).rstrip("/")+"/"
+ROOT = abspath(ROOT).rstrip("/")+"/"

-abspath = os.path.abspath(__file__)
-dname = os.path.dirname(abspath)
-os.chdir(dname)
+abspath = abspath(__file__)
+dname = dirname(abspath)
+chdir(dname)

 re_none = re.compile(r", '(None)?'\);")

+username = getpass(prompt='User: ')
+password = getpass(prompt='Password: ')
+WB = None
+pf = PrintFile()
+pf.append("strikes.sql")

-def get_items():
-    y = 2017
-    while os.path.isdir(ROOT+str(y)):
+def get_response(link, page, intento=1):
+    if intento > 3:
+        return None
+    global WB
+    if WB is None:
+        WB = Web()
+        WB.get("https://www.meneame.net/login")
+        r = WB.submit("#login-form", username=username, password=password, silent_in_fail=True)
+        if r is None:
+            time.sleep(10)
+            WB = None
+            return get_response(link, page, intento=intento+1)
+    WB.get("https://www.meneame.net/story/"+str(link))
+    if WB.response.status_code == 200:
+        url = WB.response.url+"/standard/"+str(page)
+        WB.get(url)
+        if WB.response.status_code == 200:
+            return WB.response
+    WB = None
+    return get_response(link, page, intento=intento+1)

+def get_files(y=2017):
+    while isdir(ROOT+str(y)):
         print("-- "+str(y))
         for fl in glob(ROOT+str(y)+"/*.html.gz"):
             link = fl.split("/")[-1]
-            link = link.split("-")[0]
+            link, page = link.split("-", 1)
             link = int(link)
-            with gzip.open(fl, 'r') as f:
-                soup = bs4.BeautifulSoup(f.read(), "html.parser")
-                for div in soup.select("div.comment.strike"):
-                    comment = div.select_one("a.comment-expand")
-                    comment = comment.attrs["data-id"]
-                    comment = int(comment)
-                    reason = div.select_one("div.comment-text a")
-                    if reason:
-                        reason = reason.get_text().strip()
-                        reason = reason.split(":", 1)
-                        reason = reason[-1].strip()
-                    yield Bunch(
-                        link=link,
-                        comment=comment,
-                        reason=reason
-                    )
+            page = page.split(".")[0]
+            page = int(page)
+            if getsize(fl) == 0:
+                fl = fl[:-3]
+                if not isfile(fl):
+                    r = get_response(link, page)
+                    if r is None:
+                        print("-- size=0 "+fl)
+                        continue
+                    else:
+                        with open(fl, "wb") as f:
+                            f.write(r.content)
+            yield Bunch(
+                year=y,
+                link=link,
+                page=page,
+                file=fl
+            )
         y = y + 1

+def get_soup(fl):
+    f = None
+    soup = None
+    try:
+        if fl.endswith(".gz"):
+            f = gzip.open(fl, 'r')
+        else:
+            f = open(fl, "r", errors='ignore')
+        soup = BeautifulSoup(f.read(), "html.parser")
+    except UnicodeDecodeError:
+        print("-- UnicodeDecodeError "+fl)
+    except OSError:
+        print("-- OSError "+fl)
+    if f is not None:
+        f.close()
+    return soup

+def get_nodes(select=None):
+    for fl in get_files():
+        if getsize(fl.file) == 0:
+            print("-- size=0 "+fl.file)
+            continue
+        soup = get_soup(fl.file)
+        if soup is not None:
+            if select is None:
+                yield fl.link, soup
+            else:
+                for node in soup.select(select):
+                    yield fl.link, node

+def get_items():
+    for link, div in get_nodes("div.comment.strike"):
+        comment = div.select_one("a.comment-expand")
+        comment = comment.attrs["data-id"]
+        comment = int(comment)
+        reason = div.select_one("div.comment-text a")
+        if reason:
+            reason = reason.get_text().strip()
+            reason = reason.split(":", 1)
+            reason = reason[-1].strip()
+        yield Bunch(
+            link=link,
+            comment=comment,
+            reason=reason
+        )

 sql = """
 insert into STRIKES (link, comment, reason) values
 ({link}, {comment}, '{reason}');
…
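
The diff is truncated above, but from the visible code the script appears to render each Bunch into the sql template by string formatting, and the re_none pattern compiled at the top suggests (this is an inference) that rows whose reason is None or empty get rewritten to end in NULL);. A hypothetical sketch of that final step, using only names shown in the diff:

# Assumed assembly of the INSERT statements; the NULL rewrite is an inference
# from re_none, and PrintFile presumably tees printed output to strikes.sql.
for item in get_items():
    stmt = sql.format(link=item.link, comment=item.comment, reason=item.reason)
    print(re_none.sub(", NULL);", stmt))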
