Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -119,5 +119,5 @@ disallow_untyped_calls = true
ignore_missing_imports = true

[tool.poe.tasks]
pylint-local = "pylint scraperaphai/**/*.py"
pylint-local = "pylint scrapegraphai/**/*.py"
pylint-ci = "pylint --disable=C0114,C0115,C0116 --exit-zero scrapegraphai/**/*.py"
42 changes: 24 additions & 18 deletions scrapegraphai/docloaders/chromium.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,10 @@ def __init__(
self.load_state = load_state
self.requires_js_support = requires_js_support
self.storage_state = storage_state
self.backend = kwargs.get("backend", backend)
self.browser_name = kwargs.get("browser_name", browser_name)
self.retry_limit = kwargs.get("retry_limit", retry_limit)
self.timeout = kwargs.get("timeout", timeout)
self.backend = backend
self.browser_name = browser_name
self.retry_limit = retry_limit
self.timeout = timeout

async def scrape(self, url: str) -> str:
if self.backend == "playwright":
Expand Down Expand Up @@ -159,7 +159,8 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
f"Error: Network error after {self.retry_limit} attempts - {e}"
)
finally:
driver.quit()
if "driver" in dir():
driver.quit()

return results

Expand Down Expand Up @@ -206,7 +207,7 @@ async def ascrape_playwright_scroll(
# https://www.steelwood.amsterdam/. The site does not scroll to the bottom.
# In my browser I can scroll vertically but in Chromium it scrolls horizontally?!?

if timeout and timeout <= 0:
if timeout is not None and timeout <= 0:
raise ValueError(
"If set, timeout value for scrolling scraper must be greater than 0."
)
Expand Down Expand Up @@ -316,7 +317,8 @@ async def ascrape_playwright_scroll(
f"Error: Network error after {self.retry_limit} attempts - {e}"
)
finally:
await browser.close()
if browser is not None:
await browser.close()

return results

Expand Down Expand Up @@ -434,7 +436,19 @@ async def ascrape_with_js_support(
f"Failed to scrape after {self.retry_limit} attempts: {str(e)}"
)
finally:
await browser.close()
if browser is not None:
await browser.close()

def _get_scraping_fn(self):
    """Pick the scraping method that matches this loader's settings.

    JavaScript-enabled scraping takes precedence over the backend choice;
    otherwise the ``backend`` attribute selects the method.

    Returns:
        The ``ascrape_with_js_support``, ``ascrape_playwright`` or
        ``ascrape_undetected_chromedriver`` attribute of this instance.

    Raises:
        ValueError: If ``backend`` is neither "playwright" nor "selenium".
    """
    # JS support wins regardless of which backend was configured.
    if self.requires_js_support:
        return self.ascrape_with_js_support

    chosen_backend = self.backend
    if chosen_backend == "playwright":
        return self.ascrape_playwright
    if chosen_backend == "selenium":
        return self.ascrape_undetected_chromedriver

    raise ValueError(f"Unsupported backend: {chosen_backend}")

def lazy_load(self) -> Iterator[Document]:
"""
Expand All @@ -446,11 +460,7 @@ def lazy_load(self) -> Iterator[Document]:
Yields:
Document: The scraped content encapsulated within a Document object.
"""
scraping_fn = (
self.ascrape_with_js_support
if self.requires_js_support
else getattr(self, f"ascrape_{self.backend}")
)
scraping_fn = self._get_scraping_fn()

for url in self.urls:
html_content = asyncio.run(scraping_fn(url))
Expand All @@ -470,11 +480,7 @@ async def alazy_load(self) -> AsyncIterator[Document]:
Document: A Document object containing the scraped content, along with its
source URL as metadata.
"""
scraping_fn = (
self.ascrape_with_js_support
if self.requires_js_support
else getattr(self, f"ascrape_{self.backend}")
)
scraping_fn = self._get_scraping_fn()

tasks = [scraping_fn(url) for url in self.urls]
results = await asyncio.gather(*tasks)
Expand Down
36 changes: 36 additions & 0 deletions tests/test_json_scraper_multi_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Tests for JSONScraperMultiGraph.
"""
import pytest
from scrapegraphai.graphs import JsonScraperMultiGraph


@pytest.fixture
def mock_json_config():
    """Provide a minimal graph config holding only an ``llm`` section."""
    llm_settings = {"model": "mock-model"}
    return {"llm": llm_settings}


class TestJsonScraperMultiGraph:
    """Construction-level checks for JsonScraperMultiGraph."""

    # NOTE(review): the module docstring spells the class "JSONScraperMultiGraph"
    # while the import uses "JsonScraperMultiGraph" -- confirm which name
    # scrapegraphai.graphs actually exports.

    def test_initialization(self, mock_json_config):
        """Building the graph from a basic config yields an object."""
        json_source = "[{\"test\": \"data\"}]"
        instance = JsonScraperMultiGraph(
            prompt="Extract data",
            source=json_source,
            config=mock_json_config,
        )
        assert instance is not None

    def test_empty_config_raises_error(self):
        """Building the graph from an empty config raises."""
        ctor_kwargs = {
            "prompt": "Extract data",
            "source": "[{\"test\": \"data\"}]",
            "config": {},
        }
        # NOTE(review): Exception is very broad; narrow it once the concrete
        # error type raised for an empty config is confirmed.
        with pytest.raises(Exception):
            JsonScraperMultiGraph(**ctor_kwargs)
36 changes: 36 additions & 0 deletions tests/test_smart_scraper_multi_concat_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Tests for SmartScraperMultiConcatGraph.
"""
import pytest
from scrapegraphai.graphs import SmartScraperMultiConcatGraph


@pytest.fixture
def mock_concat_config():
    """Provide a minimal graph config holding only an ``llm`` section."""
    config = {}
    config["llm"] = {"model": "mock-model"}
    return config


class TestSmartScraperMultiConcatGraph:
    """Construction-level checks for SmartScraperMultiConcatGraph."""

    def test_initialization(self, mock_concat_config):
        """Building the graph from a basic config yields an object."""
        sources = ["https://example.com"]
        instance = SmartScraperMultiConcatGraph(
            prompt="Extract data",
            source=sources,
            config=mock_concat_config,
        )
        assert instance is not None

    def test_empty_sources_raises_error(self, mock_concat_config):
        """Building the graph with an empty source list raises."""
        ctor_kwargs = {
            "prompt": "Extract data",
            "source": [],
            "config": mock_concat_config,
        }
        # NOTE(review): Exception is very broad; narrow it once the concrete
        # error type raised for an empty source list is confirmed.
        with pytest.raises(Exception):
            SmartScraperMultiConcatGraph(**ctor_kwargs)
Loading