Commit

Update scraper to use sync
Volara committed Oct 8, 2024
1 parent 7937e43 commit ffb8d49
Showing 2 changed files with 108 additions and 4 deletions.
7 changes: 3 additions & 4 deletions volara_proof/proofs/proof_of_quality.py
@@ -9,13 +9,13 @@
 from volara_proof.buffers.tweet import Tweet
 from volara_proof.models.tweet_info import TweetInfo
 from volara_proof.storage.tweet_info import TweetInfoStorage
-from twitter.scraper import Scraper, Operation, batch_ids
 from volara_proof.models.proof_config import ProofConfig
+from volara_proof.scraper.VolaraScraper import VolaraScraper


 def get_scraper(config: ProofConfig):
     cookies = json.loads(config.cookies)
-    return Scraper(cookies=cookies)
+    return VolaraScraper(cookies=cookies)


 tweet_info_storage = TweetInfoStorage()
@@ -91,8 +91,7 @@ def _validate_tweets(
 def _scrape_tweets(tweet_ids: list[str], config: ProofConfig) -> list[dict[str, T.Any]]:
     try:
         scraper = get_scraper(config)
-        scraped_tweets = scraper.tweets_by_ids(tweet_ids)
-        tweets = scraped_tweets[0]["data"]["tweetResult"]
+        tweets = scraper.get_tweets_by_ids(tweet_ids)
         return tweets
     except Exception as e:
         logging.exception(
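
For reference, a minimal before/after sketch of the caller-side change (not part of the commit): the unwrapping of the GraphQL response moves out of _scrape_tweets and into the new scraper method.

# Before: the caller unwrapped the raw TweetResultsByRestIds response itself.
scraped_tweets = scraper.tweets_by_ids(tweet_ids)
tweets = scraped_tweets[0]["data"]["tweetResult"]

# After: VolaraScraper.get_tweets_by_ids performs the same unwrapping internally,
# using synchronous requests instead of asyncio.
tweets = scraper.get_tweets_by_ids(tweet_ids)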
105 changes: 105 additions & 0 deletions volara_proof/scraper/VolaraScraper.py
@@ -0,0 +1,105 @@
import math
import requests

from twitter.scraper import Scraper, Operation, batch_ids
from twitter.util import get_headers, build_params, find_key, get_cursor


class VolaraScraper(Scraper):
    def get_tweets_by_ids(self, tweet_ids: list[str]):
        operation = Operation.TweetResultsByRestIds
        queries = list(batch_ids(tweet_ids))
        keys, _, _ = operation
        _queries = [{k: q} for q in queries for k, v in keys.items()]
        resp = self._process_sync(operation, _queries)
        scraped_tweets = resp[0][0].json()
        return scraped_tweets["data"]["tweetResult"]

    def _process_sync(self, operation: tuple, queries: list[dict], **kwargs):
        headers = self.session.headers if self.guest else get_headers(self.session)
        cookies = self.session.cookies
        return [
            self._paginate_sync(dict(headers), dict(cookies), operation, **q, **kwargs)
            for q in queries
        ]

    def _paginate_sync(
        self,
        headers: dict[str, str],
        cookies: dict[str, str],
        operation: tuple,
        **kwargs,
    ):
        limit = kwargs.pop("limit", math.inf)
        cursor = kwargs.pop("cursor", None)
        is_resuming = False
        dups = 0
        DUP_LIMIT = 3
        if cursor:
            is_resuming = True
            res = []
            ids = set()
        else:
            try:
                r = self._query_sync(operation, headers, cookies, **kwargs)
                initial_data = r.json()
                res = [r]
                ids = {x for x in find_key(initial_data, "rest_id") if x[0].isnumeric()}

                cursor = get_cursor(initial_data)
            except Exception as e:
                if self.debug:
                    self.logger.error(f"Failed to get initial pagination data: {e}")
                return
        while (dups < DUP_LIMIT) and cursor:
            prev_len = len(ids)
            if prev_len >= limit:
                break
            try:
                r = self._query_sync(
                    operation, headers, cookies, cursor=cursor, **kwargs
                )
                data = r.json()
            except Exception as e:
                if self.debug:
                    self.logger.error(f"Failed to get pagination data\n{e}")
                return
            cursor = get_cursor(data)
            ids |= {x for x in find_key(data, "rest_id") if x[0].isnumeric()}

            if self.debug:
                self.logger.debug(f"Unique results: {len(ids)}\tcursor: {cursor}")
            if prev_len == len(ids):
                dups += 1
            res.append(r)
        if is_resuming:
            return res, cursor
        return res

    def _query_sync(
        self,
        operation: tuple,
        headers: dict[str, str],
        cookies: dict[str, str],
        **kwargs,
    ) -> requests.Response:
        keys, qid, name = operation
        params = {
            "variables": Operation.default_variables | kwargs,
            "features": Operation.default_features,
        }
        params = build_params(params)
        resp = requests.get(
            f"https://twitter.com/i/api/graphql/{qid}/{name}",
            params=params,
            headers=headers,
            cookies=cookies,
            timeout=20,
        )
        try:
            self.rate_limits[name] = {
                k: int(v) for k, v in resp.headers.items() if "rate-limit" in k
            }
        except Exception as e:
            self.logger.debug(f"{e}")
        return resp
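
A minimal usage sketch (not part of the commit), mirroring how get_scraper builds the client in proof_of_quality.py; the cookie values and tweet ID below are placeholders:

import json

from volara_proof.scraper.VolaraScraper import VolaraScraper

# Placeholder cookie JSON; in the proof this comes from ProofConfig.cookies.
cookies = json.loads('{"auth_token": "<token>", "ct0": "<csrf>"}')
scraper = VolaraScraper(cookies=cookies)

# Runs the TweetResultsByRestIds query synchronously (plain requests, no asyncio.run)
# and returns the unwrapped "tweetResult" payload of the first batch.
tweets = scraper.get_tweets_by_ids(["1843666600000000000"])

Keeping the whole path synchronous means the proof never has to spin up an event loop just to fetch a batch of tweets, which appears to be the motivation behind the "use sync" change.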
