Merge main into sweep/add-sweep-config
sweep-ai[bot] authored Dec 8, 2023
2 parents 60d981c + e246a97 · commit 61dd9ea
Showing 2 changed files with 40 additions and 9 deletions.
scrapy_crawler/Bungae/TotalSearch/pipelines.py (22 additions, 7 deletions)
@@ -2,8 +2,7 @@
 from scrapy.exceptions import DropItem
 
 from scrapy_crawler.Bungae.TotalSearch.spiders.BgKeywordSpider import BgKeywordSpider
-from scrapy_crawler.common.db.models import LogCrawler
-from scrapy_crawler.common.enums import SourceEnum
+from scrapy_crawler.common.db.models import LogCrawler, RawUsedItem
 from scrapy_crawler.common.utils.constants import BunJang
 from scrapy_crawler.common.utils.custom_exceptions import (
     DropDuplicateItem,
@@ -12,9 +11,9 @@
     DropTooLowPriceItem,
 )
 from scrapy_crawler.common.utils.helpers import (
-    get_local_timestring,
     has_forbidden_keyword,
     publish_sqs_message,
+    save_image_from_url,
     too_long_text,
     too_low_price,
 )
@@ -95,12 +94,28 @@ def process_item(self, item, spider: BgKeywordSpider):
         self.session = spider.session
         spider.logger.info(f"[{type(self).__name__}] start process_item {item['pid']}")
         try:
+            # adapter = ItemAdapter(item)
+            # self.session.add(
+            #     LogCrawler(
+            #         url=BunJang.ARTICLE_URL % str(item["pid"]),
+            #         source=SourceEnum.BUNGAE.value,
+            #         created_at=get_local_timestring(),
+            #         item_status="CRAWLED",
+            #     )
+            # )
+
             self.session.add(
-                LogCrawler(
+                RawUsedItem(
+                    writer=item["writer"],
+                    title=item["title"],
+                    content=item["content"],
+                    price=item["price"],
+                    date=item["date"],
+                    source=item["source"],
                     url=BunJang.ARTICLE_URL % str(item["pid"]),
-                    source=SourceEnum.BUNGAE.value,
-                    created_at=get_local_timestring(),
-                    item_status="CRAWLED",
+                    img_url=item["img_url"],
+                    image=save_image_from_url(item["img_url"]).getvalue(),
+                    raw_json=item["raw_json"],
                 )
             )
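Both pipelines now persist the downloaded image bytes via save_image_from_url(...).getvalue(). The helper itself lives in scrapy_crawler.common.utils.helpers and is outside this diff; the sketch below is a hypothetical reconstruction inferred only from its call sites (the requests dependency and the 10-second timeout are assumptions):

    # Hypothetical sketch of save_image_from_url, inferred from its call
    # sites in this commit; the real helper in common/utils/helpers.py may differ.
    from io import BytesIO

    import requests


    def save_image_from_url(url: str) -> BytesIO:
        # Fetch the image; a timeout keeps a slow CDN from stalling the pipeline.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Wrap the payload so callers can do .getvalue() for the raw bytes.
        return BytesIO(response.content)

Whatever the exact implementation, the contract implied by both call sites is that it returns a BytesIO-like object whose getvalue() yields the raw bytes written to RawUsedItem.image.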
scrapy_crawler/Joonggonara/TotalSearch/pipelines.py (18 additions, 2 deletions)
@@ -4,15 +4,15 @@
 from itemadapter import ItemAdapter
 from scrapy import Selector
 
-from scrapy_crawler.common.db.models import LogCrawler
+from scrapy_crawler.common.db.models import LogCrawler, RawUsedItem
 from scrapy_crawler.common.enums import SourceEnum
 from scrapy_crawler.common.utils import has_forbidden_keyword, too_low_price
 from scrapy_crawler.common.utils.custom_exceptions import (
     DropDuplicateItem,
     DropForbiddenKeywordItem,
     DropTooLowPriceItem,
 )
-from scrapy_crawler.common.utils.helpers import publish_sqs_message
+from scrapy_crawler.common.utils.helpers import publish_sqs_message, save_image_from_url
 from scrapy_crawler.Joonggonara.TotalSearch.spiders.JgKeywordSpider import (
     JgKeywordSpider,
 )
@@ -171,6 +171,22 @@ def process_item(self, item, spider: JgKeywordSpider):
                     created_at=adapter["date"],
                 )
             )
+            image = save_image_from_url(adapter["img_url"] + "?type=w300")
+
+            self.session.add(
+                RawUsedItem(
+                    writer=adapter["writer"],
+                    title=adapter["title"],
+                    content=adapter["content"],
+                    price=adapter["price"],
+                    date=adapter["date"],
+                    url=adapter["url"],
+                    img_url=adapter["img_url"],
+                    source=adapter["source"],
+                    image=image.getvalue(),
+                    raw_json=adapter["raw_json"],
+                )
+            )
             self.session.commit()
             spider.logger.info(
                 f"[{type(self).__name__}][{item['url'].split('/')[-1]}] item saved"
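RawUsedItem is imported from scrapy_crawler.common.db.models, whose definition is not part of this diff. A hypothetical SQLAlchemy sketch of the model, with columns inferred solely from the keyword arguments used in the two pipelines (types, lengths, primary key, and table name are all assumptions):

    # Hypothetical reconstruction of RawUsedItem; only the column names are
    # grounded in this diff, everything else is assumed for illustration.
    from sqlalchemy import Column, Integer, LargeBinary, String, Text
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()


    class RawUsedItem(Base):
        __tablename__ = "raw_used_item"  # assumed

        id = Column(Integer, primary_key=True)  # assumed surrogate key
        writer = Column(String(255))
        title = Column(String(255))
        content = Column(Text)
        price = Column(Integer)
        date = Column(String(32))       # pipelines pass date strings through
        url = Column(String(255))
        img_url = Column(String(255))
        source = Column(String(32))
        image = Column(LargeBinary)     # raw bytes from BytesIO.getvalue()
        raw_json = Column(Text)

Note the difference between the two files: the Joonggonara pipeline keeps its existing LogCrawler insert and adds the RawUsedItem row in the same transaction, while the Bungae pipeline comments its LogCrawler insert out and now writes only RawUsedItem.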
