mirror of
https://github.com/The-CodingSloth/sloth-search.git
synced 2025-04-19 08:56:09 +00:00
first commit
Commit 7771dffa6e
19 changed files with 1639 additions and 0 deletions
164  .gitignore  vendored  Normal file
@@ -0,0 +1,164 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.DS_Store
21  LICENSE  Normal file
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 The Coding Sloth

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
93  README.md  Normal file
@@ -0,0 +1,93 @@
# Sloth Search - A Google-like Search Engine Clone

Sloth Search is a project that aims to recreate Google, including crawling, indexing, and serving results through a user-friendly front-end interface. The project consists of three main components: the Client, Search, and Server.
[Check out the video for a full explanation here](https://youtu.be/WCpimlH0Kck?si=_zFzrb1cxZinWKo3)

## Project Structure

The project is divided into the following folders:

- **Client**: Contains the front-end code, providing a user interface similar to Google Search, where users can enter queries and view search results.
- **Search**: Contains the core components of Sloth Search, which replicate the three main parts of Google:
  - **Crawling**: The web crawler that collects information from the web.
  - **Indexing**: Processing and storing the content collected by the crawler for efficient searching.
  - **Serving (PageRank)**: Serving search results ranked by relevance using the PageRank algorithm.
- **Server**: Contains the search API used to handle client requests and provide search results.

## Installation and Setup

1. **Clone the Repository**

   ```sh
   git clone <repository-url>
   cd sloth-search
   ```

2. **Install Dependencies**

   Install the necessary Python dependencies:

   ```sh
   pip install -r requirements.txt
   ```

3. **Client Setup**

   - The client contains the HTML, CSS, and JavaScript code to run the front-end.
   - Open the `index.html` file in your browser, or use a static file server to serve the client code locally.
   - You can also use the Live Server extension.

4. **Search Setup**

   - The `Search` directory contains the code for crawling, indexing, and serving.
   - You can start the process by running:

   ```sh
   python search/complete_examples/advanced_pagerank.py
   ```

   - This will crawl, index, and prepare the content for searching.
   - If you want to run any other file, follow the same process:

   ```sh
   python search/<path to file you want to run>
   ```

5. **Server Setup**

   - The server uses Flask to provide an API for search queries.
   - Start the Flask server by navigating to the `Server` directory and running:

   ```sh
   python google_search_api.py
   ```

## How It Works

1. **Crawling**

   - The crawler starts with a set of seed URLs and collects links and content from the web.
   - It respects `robots.txt` to avoid being blocked and to ensure ethical crawling.
   - Parsed data is stored in a format ready for indexing.

2. **Indexing**

   - The indexing module processes the crawled pages.
   - The content is tokenized, cleaned, stemmed, and stripped of stop words using the NLTK library (see the short sketch after this list).
   - The resulting indexed data is saved to be used by the search API.

3. **Serving and PageRank**

   - The PageRank algorithm is used to rank pages based on their importance.
   - When a user searches for a query through the client, the server uses the indexed data and PageRank scores to return the most relevant pages.
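
For a concrete picture of the indexing step, here is a minimal, self-contained sketch of the same tokenize / stop-word / stem pipeline that `search/indexing/advanced_indexing.py` applies to each crawled page. It assumes the NLTK `stopwords` and `punkt` data are already downloaded (the indexing module fetches them on first run), and the sample sentence is made up:

```python
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Made-up text standing in for what BeautifulSoup extracts from a page.
text = "Sloth Search crawls pages and ranks them with PageRank."

stop_words = set(stopwords.words("english"))
ps = PorterStemmer()

tokens = word_tokenize(text.lower())              # split into word tokens
words = [ps.stem(w) for w in tokens               # stem each remaining word
         if w.isalpha() and w not in stop_words]  # keep alphabetic, non-stop words
print(words)  # e.g. ['sloth', 'search', 'crawl', 'page', 'rank', 'pagerank']
```

These stemmed words become the keys of the inverted index, so a query containing "ranking" and a page containing "ranks" can meet at the same stem.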

## Important Notes

- **Respecting Websites**: The crawler respects `robots.txt` rules. Please make sure not to overload any websites.
- **PageRank Algorithm**: The PageRank implementation uses an iterative approach to rank pages based on the links between them (see the update formula below).
- **Data Storage**: The crawler and indexer use CSV files for data storage (`advanced_pagerank_inverted_index.csv` and `advanced_pagerank.csv`). Make sure these files are writable during execution.
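
As a reference for the iterative approach mentioned above, each iteration updates every page's score roughly as the standard damped PageRank rule that `search/serving/pagerank.py` implements (d is the damping factor, 0.85 by default; N is the total number of pages; the sum runs over pages q that link to p; L(q) is the number of outgoing links on q):

```math
PR(p) = \frac{1 - d}{N} + d \sum_{q \to p} \frac{PR(q)}{L(q)}
```

The implementation also redistributes the score of dangling pages (pages with no outgoing links) evenly across all pages, and stops once the scores change by less than a small tolerance between iterations.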

## Contributing

Contributions are welcome! If you'd like to contribute to the development of Sloth Search, feel free to fork the repository, make changes, and submit a pull request.

## License

This project is open-source and available under the MIT License.

If you have any questions or suggestions, feel free to contact me.

Happy Searching with Sloth Search! 🦥🔍
1  client/images/google_camera.svg  Normal file
@@ -0,0 +1 @@
<svg class="Gdd5U" focusable="false" viewBox="0 0 192 192" xmlns="http://www.w3.org/2000/svg"><rect fill="none" height="192" width="192"></rect><g><circle fill="#34a853" cx="144.07" cy="144" r="16"></circle><circle fill="#4285f4" cx="96.07" cy="104" r="24"></circle><path fill="#ea4335" d="M24,135.2c0,18.11,14.69,32.8,32.8,32.8H96v-16l-40.1-0.1c-8.8,0-15.9-8.19-15.9-17.9v-18H24V135.2z"></path><path fill="#fbbc04" d="M168,72.8c0-18.11-14.69-32.8-32.8-32.8H116l20,16c8.8,0,16,8.29,16,18v30h16V72.8z"></path><path fill="#4285f4" d="M112,24l-32,0L68,40H56.8C38.69,40,24,54.69,24,72.8V92h16V74c0-9.71,7.2-18,16-18h80L112,24z"></path></g></svg>
(New image: 641 B)
1  client/images/google_mic.svg  Normal file
@@ -0,0 +1 @@
<svg class="goxjub" focusable="false" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path fill="#4285f4" d="m12 15c1.66 0 3-1.31 3-2.97v-7.02c0-1.66-1.34-3.01-3-3.01s-3 1.34-3 3.01v7.02c0 1.66 1.34 2.97 3 2.97z"></path><path fill="#34a853" d="m11 18.08h2v3.92h-2z"></path><path fill="#fbbc04" d="m7.05 16.87c-1.27-1.33-2.05-2.83-2.05-4.87h2c0 1.45 0.56 2.42 1.47 3.38v0.32l-1.15 1.18z"></path><path fill="#ea4335" d="m12 16.93a4.97 5.25 0 0 1 -3.54 -1.55l-1.41 1.49c1.26 1.34 3.02 2.13 4.95 2.13 3.87 0 6.99-2.92 6.99-7h-1.99c0 2.92-2.24 4.93-5 4.93z"></path></svg>
(New image: 574 B)
1  client/images/google_search_icon.svg  Normal file
@@ -0,0 +1 @@
<svg focusable="false" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M15.5 14h-.79l-.28-.27A6.471 6.471 0 0 0 16 9.5 6.5 6.5 0 1 0 9.5 16c1.61 0 3.09-.59 4.23-1.57l.27.28v.79l5 4.99L20.49 19l-4.99-5zm-6 0C7.01 14 5 11.99 5 9.5S7.01 5 9.5 5 14 7.01 14 9.5 11.99 14 9.5 14z"></path></svg>
(New image: 304 B)
BIN  client/images/sloth_search.png  Normal file
Binary file not shown. (New image: 22 KiB)
77  client/index.html  Normal file
@@ -0,0 +1,77 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>am real programmer</title>
    <link rel="stylesheet" href="styles.css" />
  </head>

  <body>
    <div class="top-section">
      <div class="left-side">
        <a href="https://about.google/">About</a>
        <a href="https://store.google.com/">Store</a>
      </div>
      <div class="right-side">
        <a href="https://gmail.com/">Gmail</a>
        <a href="https://images.google.com/">Images</a>
        <img
          class="app-icon"
          src="https://cdn3.iconfinder.com/data/icons/feather-5/24/more-vertical-512.png"
        />
        <img
          class="profile-pic"
          src="https://i.pinimg.com/1200x/b9/c4/7e/b9c47ef70bff06613d397abfce02c6e7.jpg"
        />
      </div>
    </div>
    <div class="middle-section">
      <a href="index.html">
        <img class="search-logo-home" src="images/sloth_search.png" />
      </a>
      <form
        class="search-form"
        id="search-form"
        action="search.html"
        method="get"
        autocomplete="off"
      >
        <div class="search-form-input">
          <img
            class="search-icon-home"
            src="https://cdn-icons-png.flaticon.com/512/3031/3031293.png"
          />
          <input type="text" name="search" id="search-input" />
          <img class="mic" src="./images/google_mic.svg" />
          <img class="camera" src="./images/google_camera.svg" />
        </div>
        <div class="buttons">
          <button type="submit" id="search-button">
            Sloth Search
          </button>
          <button type="submit">I'm Feeling unlucky</button>
        </div>
      </form>
    </div>
    <div class="bottom-section">
      <div class="bottom-left">
        <a href="https://ads.google.com/">Advertising</a>
        <a href="https://smallbusiness.withgoogle.com/#!/">Business</a>
        <a href="https://www.google.com/search/howsearchworks/?fg=1"
          >How Search works</a
        >
      </div>
      <div class="bottom-middle">
        <a href="https://sustainability.google/carbon-free/#home"
          >Carbon Neutral since 2007</a
        >
      </div>
      <div class="bottom-right">
        <a href="https://policies.google.com/privacy?hl=en&fg=1">Privacy</a>
        <a href="https://policies.google.com/terms?hl=en&fg=1">Terms</a>
        <a href="https://www.google.com/settings">Settings</a>
      </div>
    </div>
  </body>
</html>
119  client/search.html  Normal file
@@ -0,0 +1,119 @@
<!-- search.html -->
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <title>Search Results - My Search Engine</title>
    <link rel="stylesheet" href="styles.css" />
  </head>
  <body>
    <div class="search-result-area">
      <a href="index.html">
        <img class="search-logo-result" src="images/sloth_search.png" />
      </a>

      <form
        class="search-form"
        id="search-form"
        action="search.html"
        method="get"
        autocomplete="off"
      >
        <div class="search-form-input">
          <input type="text" name="search" id="search-input" />
          <img class="mic" src="./images/google_mic.svg" />
          <img class="camera" src="./images/google_camera.svg" />
          <img
            class="search-icon-result"
            src="./images/google_search_icon.svg"
          />
        </div>
      </form>
    </div>
    <div id="results"></div>
    <div id="pagination"></div>
    <script>
      // Get the query parameter from the URL
      const urlParams = new URLSearchParams(window.location.search);
      const query = urlParams.get('search');
      document.getElementById('search-input').value = query;
      let page = parseInt(urlParams.get('page')) || 1;
      const numResults = 10; // Adjust as needed

      // Add pagination controls
      function addPaginationControls(totalResults) {
        const paginationDiv = document.getElementById('pagination');
        paginationDiv.innerHTML = ''; // Clear any existing pagination controls

        if (page > 1) {
          const prevLink = document.createElement('a');
          prevLink.href = `search.html?search=${encodeURIComponent(
            query
          )}&page=${page - 1}`;
          prevLink.textContent = 'Previous';
          paginationDiv.appendChild(prevLink);
        }

        if (totalResults === numResults) {
          const nextLink = document.createElement('a');
          nextLink.href = `search.html?search=${encodeURIComponent(
            query
          )}&page=${page + 1}`;
          nextLink.textContent = 'Next';
          paginationDiv.appendChild(nextLink);
        }
      }

      // Function to fetch and display search results
      async function fetchResults() {
        try {
          const response = await fetch(
            `http://127.0.0.1:5000/search?q=${encodeURIComponent(
              query
            )}&page=${page}&num_results=${numResults}`
          );
          const data = await response.json();

          const resultsDiv = document.getElementById('results');

          if (data.results.length === 0) {
            resultsDiv.innerHTML = `<p>No results found for "<strong>${query}</strong>".</p>`;
            return;
          }

          data.results.forEach((result) => {
            const resultDiv = document.createElement('div');
            resultDiv.classList.add('result');

            const titleLink = document.createElement('a');
            titleLink.classList.add('result-title');
            titleLink.href = result.url;
            titleLink.textContent = result.title || result.url;

            const urlDiv = document.createElement('div');
            urlDiv.classList.add('result-url');
            urlDiv.textContent = result.url;

            const descriptionDiv = document.createElement('div');
            descriptionDiv.classList.add('result-description');
            descriptionDiv.textContent = result.description;

            resultDiv.appendChild(titleLink);
            resultDiv.appendChild(urlDiv);
            resultDiv.appendChild(descriptionDiv);

            resultsDiv.appendChild(resultDiv);
          });

          // Add pagination controls
          addPaginationControls(data.results.length);
        } catch (error) {
          console.error('Error fetching search results:', error);
        }
      }

      // Fetch and display results when the page loads
      fetchResults();
    </script>
  </body>
</html>
247  client/styles.css  Normal file
@@ -0,0 +1,247 @@
* {
  margin: 0;
  padding: 0;

  font-family: 'Roboto', sans-serif;
}

body {
  display: flex;
  flex-direction: column;
  min-height: 100vh;
  /* ensures the body takes up at least the full viewport height */
}

a {
  all: unset;
  text-decoration: none;
  /* no underline */
}

.top-section {
  padding: 1rem;
  display: flex;
  justify-content: space-between;
}

.app-icon {
  width: 1.5rem;
  height: 1.5rem;
}

.profile-pic {
  width: 2rem;
  height: 2rem;
  border-radius: 100%;
}

.left-side {
  display: flex;
  gap: 1.5rem;
}

.right-side {
  display: flex;
  gap: 1.5rem;
  justify-content: center;
  align-items: center;
}

.left-side a,
.right-side a {
  color: #202124;
  font-size: 0.8rem;
}

.middle-section {
  flex-grow: 1;
  display: flex;
  flex-direction: column;
  justify-content: center;
  align-items: center;
  padding: 1rem 0;
  gap: 1.2rem;
}

.search-label {
  display: none;
}

.search-form {
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  gap: 2.5rem;
}
.result-search-form {
  flex-direction: column;
  align-items: center;
  justify-content: center;
  gap: 2.5rem;
}

.search-form-input {
  display: flex;
  align-items: center;
  justify-content: center;
  gap: 1rem;
  border: 1px solid #dfe1e5;
  border-radius: 30px;
  padding: 0.3rem 1.5rem;
  box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.1);
}

.search-form input {
  width: 27rem;
  padding: 0.5rem;
  border: none;
  outline: none;
}

.buttons {
  display: flex;
  gap: 1rem;
}

.search-form button {
  border: 1px solid #f8f9fa;
  padding: 0.5rem 1rem;
  background-color: #f8f9fa;
  font-size: 0.9rem;
}
.search-icon-home {
  width: 1rem;
  height: 1rem;
}
.search-icon-result {
  width: 1.5rem;
  height: 1.5rem;
}
.mic,
.camera {
  width: 1.5rem;
  height: 1.5rem;
}

.bottom-section {
  margin-top: 15rem;
  padding: 1rem;
  display: flex;
  justify-content: space-between;
  align-items: center;
  background-color: #f2f2f2;
  font-size: 0.9em;
  padding-left: 2rem;
  padding-right: 2rem;
}

.bottom-left,
.bottom-right {
  display: flex;
  gap: 1.8rem;
}

.bottom-middle {
  padding-right: 10rem;
}

.bottom-section a {
  color: #70757a;
}

.search-form button {
  background-color: #f8f9fa;
  border: 1px solid #f8f9fa;
  border-radius: 4px;
  color: #3c4043;
  font-family: Roboto, arial, sans-serif;
  font-size: 14px;
  margin: 11px 4px;
  padding: 0 16px;
  line-height: 27px;
  height: 36px;
  min-width: 54px;
  text-align: center;
  cursor: pointer;
  user-select: none;
}

.bottom-section {
  display: flex;
  justify-content: space-between;
  align-items: center;
  background-color: #f2f2f2;
  padding: 1rem 1.5rem;
  margin-top: 15rem;
}

.bottom-section a {
  margin: 0 1rem;
}

.bottom-middle {
  margin-right: 8rem;
}

.search-result-area {
  display: flex;
  padding-left: 1rem;
  gap: 1rem;
}
.search-logo-home {
  width: 20rem;
}
.search-logo-result {
  width: 7rem;
}

#results {
  padding-top: 1rem;
  display: flex;
  flex-direction: column;
  gap: 1rem;
  padding-left: 2rem;
  padding-right: 2rem;
}
.result:hover {
  cursor: pointer;
}

.result-description {
  font-size: 0.8rem;
  width: 50%;
  color: #545454;
}
.result {
  margin-bottom: 20px;
}
.result-title {
  font-size: 18px;
  color: #1a0dab;
  text-decoration: none;
}
.result-title:hover {
  text-decoration: underline;
}
.result-url {
  font-size: 14px;
  color: #006621;
}
#pagination {
  display: flex;
  justify-content: center;
  align-items: center;
  gap: 1.5rem;
  padding: 2rem;
  font-size: 1.2rem;
}

#pagination a {
  color: #1a0dab;
}

#pagination a:hover {
  text-decoration: underline;
  cursor: pointer;
}
239  search/complete_examples/advanced_pagerank.py  Normal file
@@ -0,0 +1,239 @@
from bs4 import BeautifulSoup
import requests
import time
import random
from queue import Queue
from concurrent.futures import ThreadPoolExecutor
import threading
from urllib.parse import urlparse
import csv
import sys
import os

# Add the root directory to sys.path
# This lets us import modules from the sibling directories (indexing and serving).
# Any imports from indexing/serving need to happen after this line.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from indexing.advanced_indexing import advanced_index_page
from serving.pagerank import compute_pagerank


# Function to check robots.txt for permission to crawl
# If we don't do this, we could get blocked/banned
# since we don't have permission to crawl.
def can_crawl(url):
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    print(f"Checking robots.txt for: {robots_url}")
    time.sleep(random.uniform(1, 3))
    try:
        response = requests.get(robots_url, timeout=5)
        response.raise_for_status()
        disallowed_paths = []
        for line in response.text.splitlines():
            if line.startswith("Disallow"):
                parts = line.split()
                if len(parts) > 1:
                    disallowed_paths.append(parts[1])
        for path in disallowed_paths:
            if urlparse(url).path.startswith(path):
                print(f"Disallowed by robots.txt: {url}")
                return False
        return True
    except requests.RequestException:
        print(f"Failed to access robots.txt: {robots_url}")
        return False  # If we can't access robots.txt, assume we can't crawl (we're being nice here)


# Function to fetch and parse URL
def crawl(args):
    queue = args['queue']
    visited_urls = args['visited_urls']
    crawl_count = args['crawl_count']
    CRAWL_LIMIT = args['CRAWL_LIMIT']
    lock = args['lock']
    index = args['index']
    webpage_info = args['webpage_info']
    webpage_id_counter = args['webpage_id_counter']
    pagerank_graph = args['pagerank_graph']
    stop_crawl = args['stop_crawl']

    while not stop_crawl.is_set():
        try:
            current_url = queue.get(timeout=5)
            print("Time to crawl: " + current_url)
        except Exception:
            break  # Exit if no more URLs are available to crawl

        with lock:
            if crawl_count[0] >= CRAWL_LIMIT:
                queue.queue.clear()  # Clear remaining URLs to stop processing
                print("Crawl limit reached. Exiting...")
                stop_crawl.set()
                break
            if current_url in visited_urls:
                queue.task_done()
                continue
            visited_urls.add(current_url)

        """ Checks robots.txt before fetching the page.
        Comment this out if you don't want the robots.txt check.
        WARNING: websites could block/ban you if you crawl without permission.
        """
        if not can_crawl(current_url):
            queue.task_done()
            continue

        time.sleep(random.uniform(2, 5))
        try:
            response = requests.get(current_url, timeout=5)
            response.raise_for_status()  # Check for request errors
            content = response.content

            """ Checks for noindex directive in the page.
            Comment this out if you don't care about noindex.
            WARNING: websites could block/ban you if you don't have permission.
            """
            if 'noindex' in content.decode('utf-8').lower():
                print(f"Noindex found, skipping: {current_url}")
                queue.task_done()
                continue

            # Parse the fetched content to find new URLs
            webpage = BeautifulSoup(content, "html.parser")

            # Index the webpage
            indexed_page = advanced_index_page(webpage, current_url)
            with lock:
                for word in indexed_page["words"]:
                    if word not in index:
                        index[word] = set()
                    index[word].add(webpage_id_counter[0])
                webpage_info[webpage_id_counter[0]] = indexed_page
                webpage_id_counter[0] += 1

            hyperlinks = webpage.select("a[href]")
            # NEW: Add hyperlink connections for pagerank
            new_urls, hyperlink_connections = parse_links(hyperlinks, current_url)
            pagerank_graph[current_url] = hyperlink_connections

            with lock:
                for new_url in new_urls:
                    if new_url not in visited_urls:
                        queue.put(new_url)
                crawl_count[0] += 1

        except requests.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")
        finally:
            queue.task_done()


# Function to parse links from HTML content
def parse_links(hyperlinks, current_url):
    urls = []
    # NEW: Add hyperlink connections for pagerank
    hyperlink_connections = set()
    for hyperlink in hyperlinks:
        url = hyperlink["href"]

        # Format the URL into a proper URL
        if url.startswith("#"):
            continue  # Skip same-page anchors
        if url.startswith("//"):
            url = "https:" + url  # Add scheme to protocol-relative URLs
        elif url.startswith("/"):
            # Construct full URL for relative links
            base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url))
            url = base_url + url
        elif not url.startswith("http"):
            continue  # Skip non-HTTP links
        url = url.split("#")[0]  # Remove anchor

        hyperlink_connections.add(url)
        urls.append(url)
    return urls, hyperlink_connections


# Main crawling function
def sloth_bot():
    # Start with the initial pages to crawl
    starting_urls = [
        "https://www.wikipedia.org/wiki/Google",
        "https://www.bbc.com/news/world",
        "https://news.ycombinator.com/",
    ]

    urls_to_crawl = Queue()
    for seed_url in starting_urls:
        urls_to_crawl.put(seed_url)

    visited_urls = set()  # URL tracking
    CRAWL_LIMIT = 20  # Set crawl limit
    crawl_count = [0]  # Shared counter
    lock = threading.Lock()  # Thread safety lock
    index = {}
    webpage_info = {}
    # NEW: pagerank graph for pagerank.
    # This will be used to store the connections between hyperlinks
    pagerank_graph = {}
    webpage_id_counter = [0]
    stop_crawl = threading.Event()

    # Start concurrent crawling with ThreadPoolExecutor
    # Concurrency = speed
    # Threads go BRRRRR
    # Increase this if you want more threads, but be careful with these.
    NUM_WORKERS = 100
    # Setting up arguments for the crawl function
    args = {
        'queue': urls_to_crawl,
        'visited_urls': visited_urls,
        'crawl_count': crawl_count,
        'CRAWL_LIMIT': CRAWL_LIMIT,
        'lock': lock,
        'index': index,
        'webpage_info': webpage_info,
        'webpage_id_counter': webpage_id_counter,
        'pagerank_graph': pagerank_graph,
        'stop_crawl': stop_crawl
    }

    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        for _ in range(NUM_WORKERS):
            executor.submit(crawl, args)

    print("All URLs have been crawled")

    # NEW: Computes pagerank
    pagerank_scores = compute_pagerank(pagerank_graph)

    """ This part is for saving the data to CSV files.
    However, if you don't want to save the data, you can remove/comment out this part.
    If you want to use a database, you can replace this part with a database connection.
    """
    with open('advanced_pagerank_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['word', 'doc_ids']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for word, doc_ids in index.items():
            writer.writerow({'word': word, 'doc_ids': list(doc_ids)})

    with open('advanced_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['doc_id', 'url', 'title', 'description', 'pagerank']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for doc_id, info in webpage_info.items():
            writer.writerow({
                'doc_id': doc_id,
                'url': info['url'],
                'title': info['title'],
                'description': info['description'],
                'pagerank': pagerank_scores.get(info['url'], 0)
            })


# Entry point for the script
def main():
    sloth_bot()


if __name__ == "__main__":
    main()
110  search/complete_examples/simple_pagerank.py  Normal file
@@ -0,0 +1,110 @@
from bs4 import BeautifulSoup
import requests
import time
import random
import csv
import sys
import os

# Add the root directory to sys.path
# This lets us import modules from the sibling directories (indexing and serving).
# Any imports from indexing/serving need to happen after this line.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from indexing.simple_indexing import simple_index_page
from serving.pagerank import compute_pagerank


def sloth_bot():
    # Our list of URLs to crawl
    urls = ["https://en.wikipedia.org/wiki/Google"]
    visited_urls = set()

    # Create the index and graph
    index = {}  # URL -> page contents
    pagerank_graph = {}  # URL -> set of URLs it links to
    CRAWL_LIMIT = 5
    crawl_count = 0

    # Loops through the list of URLs
    while urls and crawl_count < CRAWL_LIMIT:
        # Grab the next URL
        current_url = urls.pop()
        if current_url in visited_urls:
            continue
        print("Time to crawl: " + current_url)
        time.sleep(random.uniform(1, 2))
        try:
            response = requests.get(current_url)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to retrieve {current_url}: {e}")
            continue

        # Parse the content of the page
        webpage = BeautifulSoup(response.content, "html.parser")

        # Add the page to the index
        indexed_page = simple_index_page(webpage, current_url)
        index[current_url] = indexed_page
        visited_urls.add(current_url)

        # Grab the links from the page
        hyperlinks = webpage.select("a[href]")
        # This is where we store our connected pages
        hyperlink_connections = set()
        for hyperlink in hyperlinks:
            url = hyperlink["href"]
            # Format the URL into a proper URL
            if url.startswith("#"):
                continue
            if url.startswith("//"):
                url = "https:" + url
            elif url.startswith("/"):
                base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url))
                url = base_url + url
            elif not url.startswith("http"):
                continue
            url = url.split('#')[0]
            # Add to the link connections
            hyperlink_connections.add(url)
            # If we haven't visited this URL yet, add it to our list
            if url not in visited_urls:
                urls.append(url)

        # Update the page's outgoing links
        index[current_url]['hyperlink_connections'] = hyperlink_connections
        pagerank_graph[current_url] = hyperlink_connections

        crawl_count += 1
        print(f"Crawled count: {crawl_count}, index size: {len(index)}, URLs left: {len(urls)}")

    # Compute PageRank
    pagerank_scores = compute_pagerank(pagerank_graph)

    """ This part is for saving the data to CSV files.
    However, if you don't want to save the data, you can remove/comment out this part.
    If you want to use a database, you can replace this part with a database connection.
    """

    with open('simple_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["url", "title", "description", "pagerank", "words"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for url, info in index.items():
            writer.writerow({
                'url': url,
                'title': info['title'],
                'description': info['description'],
                'pagerank': pagerank_scores.get(url, 0),
                'words': ', '.join(info['words'])
            })


def main():
    # Start the crawling process
    sloth_bot()


if __name__ == "__main__":
    main()
224  search/crawling/advanced_crawler.py  Normal file
@@ -0,0 +1,224 @@
from bs4 import BeautifulSoup
import requests
import time
import random
from queue import Queue
from concurrent.futures import ThreadPoolExecutor
import threading
from urllib.parse import urlparse
import csv
import sys
import os

# Add the root directory to sys.path
# This lets us import modules from the sibling directories (indexing and serving).
# Any imports from indexing/serving need to happen after this line.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from indexing.advanced_indexing import advanced_index_page


# Function to check robots.txt for permission to crawl
# If we don't do this, we could get blocked/banned
# since we don't have permission to crawl.
def can_crawl(url):
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    print(f"Checking robots.txt for: {robots_url}")
    time.sleep(random.uniform(1, 3))
    try:
        response = requests.get(robots_url, timeout=5)
        response.raise_for_status()
        disallowed_paths = []
        for line in response.text.splitlines():
            if line.startswith("Disallow"):
                parts = line.split()
                if len(parts) > 1:
                    disallowed_paths.append(parts[1])
        for path in disallowed_paths:
            if urlparse(url).path.startswith(path):
                print(f"Disallowed by robots.txt: {url}")
                return False
        return True
    except requests.RequestException:
        print(f"Failed to access robots.txt: {robots_url}")
        return False  # If we can't access robots.txt, assume we can't crawl (we're being nice here)


# Function to fetch and parse URL
def crawl(args):
    queue = args['queue']
    visited_urls = args['visited_urls']
    crawl_count = args['crawl_count']
    CRAWL_LIMIT = args['CRAWL_LIMIT']
    lock = args['lock']
    index = args['index']
    webpage_info = args['webpage_info']
    webpage_id_counter = args['webpage_id_counter']
    stop_crawl = args['stop_crawl']

    while not stop_crawl.is_set():
        try:
            current_url = queue.get(timeout=5)
            print("Time to crawl: " + current_url)
        except Exception:
            break  # Exit if no more URLs are available to crawl

        with lock:
            if crawl_count[0] >= CRAWL_LIMIT:
                queue.queue.clear()  # Clear remaining URLs to stop processing
                print("Crawl limit reached. Exiting...")
                stop_crawl.set()
                break
            if current_url in visited_urls:
                queue.task_done()
                continue
            visited_urls.add(current_url)

        """ Checks robots.txt before fetching the page.
        Uncomment this if you want the robots.txt check.
        WARNING: websites could block/ban you if you crawl without permission.
        """
        # if not can_crawl(current_url):
        #     queue.task_done()
        #     continue

        time.sleep(random.uniform(2, 5))
        try:
            response = requests.get(current_url, timeout=5)
            response.raise_for_status()  # Check for request errors
            content = response.content

            """ Checks for noindex directive in the page.
            Uncomment this if you care about noindex.
            WARNING: websites could block/ban you if you don't have permission.
            """
            # if 'noindex' in content.decode('utf-8').lower():
            #     print(f"Noindex found, skipping: {current_url}")
            #     queue.task_done()
            #     continue

            # Parse the fetched content to find new URLs
            webpage = BeautifulSoup(content, "html.parser")

            # Index the webpage
            indexed_page = advanced_index_page(webpage, current_url)
            with lock:
                for word in indexed_page["words"]:
                    if word not in index:
                        index[word] = set()
                    index[word].add(webpage_id_counter[0])
                webpage_info[webpage_id_counter[0]] = indexed_page
                webpage_id_counter[0] += 1

            hyperlinks = webpage.select("a[href]")
            new_urls = parse_links(hyperlinks, current_url)

            with lock:
                for new_url in new_urls:
                    if new_url not in visited_urls:
                        queue.put(new_url)
                crawl_count[0] += 1

        except requests.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")
        finally:
            queue.task_done()


# Function to parse links from HTML content
def parse_links(hyperlinks, current_url):
    urls = []
    for hyperlink in hyperlinks:
        url = hyperlink["href"]

        # Format the URL into a proper URL
        if url.startswith("#"):
            continue  # Skip same-page anchors
        if url.startswith("//"):
            url = "https:" + url  # Add scheme to protocol-relative URLs
        elif url.startswith("/"):
            # Construct full URL for relative links
            base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url))
            url = base_url + url
        elif not url.startswith("http"):
            continue  # Skip non-HTTP links
        url = url.split("#")[0]  # Remove anchor
        urls.append(url)
    return urls


# Main crawling function
def sloth_bot():
    # Start with the initial pages to crawl
    starting_urls = [
        "https://www.wikipedia.org/wiki/Google",
        "https://www.bbc.com/news/world",
        "https://news.ycombinator.com/",
    ]

    urls_to_crawl = Queue()
    for seed_url in starting_urls:
        urls_to_crawl.put(seed_url)

    visited_urls = set()  # URL tracking
    CRAWL_LIMIT = 20  # Set crawl limit
    crawl_count = [0]  # Shared counter
    lock = threading.Lock()  # Thread safety lock
    index = {}
    webpage_info = {}
    webpage_id_counter = [0]
    stop_crawl = threading.Event()

    # Start concurrent crawling with ThreadPoolExecutor
    # Concurrency = speed
    # Threads go BRRRRR
    # Increase this if you want more threads, but be careful with these.
    NUM_WORKERS = 100
    # Setting up arguments for the crawl function
    args = {
        'queue': urls_to_crawl,
        'visited_urls': visited_urls,
        'crawl_count': crawl_count,
        'CRAWL_LIMIT': CRAWL_LIMIT,
        'lock': lock,
        'index': index,
        'webpage_info': webpage_info,
        'webpage_id_counter': webpage_id_counter,
        'stop_crawl': stop_crawl
    }

    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        for _ in range(NUM_WORKERS):
            executor.submit(crawl, args)

    print("All URLs have been crawled")

    """ This part is for saving the data to CSV files.
    However, if you don't want to save the data, you can remove/comment out this part.
    If you want to use a database, you can replace this part with a database connection.
    """
    with open('advanced_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['word', 'doc_ids']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for word, doc_ids in index.items():
            writer.writerow({'word': word, 'doc_ids': list(doc_ids)})

    with open('advanced_doc_info.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['doc_id', 'url', 'title', 'description']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for doc_id, info in webpage_info.items():
            writer.writerow({
                'doc_id': doc_id,
                'url': info['url'],
                'title': info['title'],
                'description': info['description']
            })


def main():
    # Start the crawling process
    sloth_bot()


if __name__ == "__main__":
    main()
65  search/crawling/simple_crawler.py  Normal file
@@ -0,0 +1,65 @@
from bs4 import BeautifulSoup
import requests
import time
import random


def sloth_bot():
    # our list of URLs to crawl
    urls = ["https://en.wikipedia.org/wiki/Google"]
    visited_urls = set()
    # timer to see how long it takes to crawl
    start = time.time()
    # Loops through the list of urls
    CRAWL_LIMIT = 15
    current_crawl_count = 0

    while urls and current_crawl_count < CRAWL_LIMIT:
        # grabs the next url
        current_url = urls.pop(0)
        print("time to crawl: " + current_url)
        time.sleep(random.uniform(1, 3))
        try:
            response = requests.get(current_url)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to retrieve {current_url}: {e}")
            continue

        # grabbing the content of the page
        webpage = BeautifulSoup(response.content, "html.parser")

        # grabbing the links from the page
        hyperlinks = webpage.select("a[href]")
        # looping through the links and adding them to our list of urls
        for hyperlink in hyperlinks:
            url = hyperlink["href"]
            # Formats the url into a proper url (don't worry about this)
            if url.startswith("#"):
                continue
            if url.startswith("//"):
                url = "https:" + url
            elif url.startswith("/"):
                base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url))
                url = base_url + url
            elif not url.startswith("http"):
                continue
            # Remove anchor
            url = url.split('#')[0]

            # if we haven't visited this url yet, add it to our list
            if url not in visited_urls:
                urls.append(url)
                visited_urls.add(url)

        current_crawl_count += 1


def main():
    # Start the crawling process
    sloth_bot()


if __name__ == "__main__":
    main()
0  search/indexing/__init__.py  Normal file
73  search/indexing/advanced_indexing.py  Normal file
@@ -0,0 +1,73 @@
import nltk
import ssl
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('stopwords')
nltk.download('punkt_tab')
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


# Download NLTK data only if not already downloaded
def download_nltk_resources():
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
    try:
        word_tokenize('test')
    except LookupError:
        nltk.download('punkt')


# Function that indexes the webpage
def advanced_index_page(webpage, webpage_url):
    # Download NLTK data only if not already downloaded
    download_nltk_resources()

    # Initialize NLTK components
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    # Collect title
    title_tag = webpage.find('title')
    title = title_tag.get_text().strip() if title_tag else 'No Title'

    # Collect description
    description = ''
    meta_description = webpage.find('meta', attrs={'name': 'description'})
    if meta_description and 'content' in meta_description.attrs:
        description = meta_description['content']
    else:
        text_content = webpage.get_text(separator=" ", strip=True)
        description = text_content[:200] + "..." if len(text_content) > 200 else text_content

    # Grab ALL the words in the page.
    text_content = webpage.get_text(separator=' ', strip=True)
    # Splitting them into the individual words
    tokens = word_tokenize(text_content.lower())
    # Big brain techniques 2 and 3:
    # stemming the words and removing stop words.
    filtered_words = [
        ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words
    ]

    # Add the information to the index
    indexed_page = {
        "url": webpage_url,
        "title": title,
        "description": description,
        "words": filtered_words
    }
    # If you want to print the results:
    # print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(filtered_words)} \n")
    return indexed_page
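
A minimal sketch of calling `advanced_index_page` on its own, outside the crawler. It assumes you run it from inside the `search/` directory (so `indexing/` is importable) and that the HTML snippet is made up:

```python
from bs4 import BeautifulSoup
from indexing.advanced_indexing import advanced_index_page

# Hypothetical page content just for trying out the indexer.
html = """
<html>
  <head>
    <title>Example page</title>
    <meta name="description" content="A tiny page used to try out the indexer." />
  </head>
  <body><p>Sloths are searching the web for leaves.</p></body>
</html>
"""

page = BeautifulSoup(html, "html.parser")
indexed = advanced_index_page(page, "https://example.com/sloths")
print(indexed["title"])        # the <title> text
print(indexed["description"])  # the meta description
print(indexed["words"])        # stemmed, stop-word-free tokens
```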
34  search/indexing/simple_indexing.py  Normal file
@@ -0,0 +1,34 @@
import re


def simple_index_page(webpage, webpage_url):
    # Collect title
    title_tag = webpage.find('title')
    title = title_tag.get_text().strip() if title_tag else 'No Title'

    # Collect description
    description = ''
    meta_description = webpage.find('meta', attrs={'name': 'description'})
    if meta_description and 'content' in meta_description.attrs:
        description = meta_description['content']
    else:
        text_content = webpage.get_text(separator=" ", strip=True)
        description = text_content[:200] + "..." if len(text_content) > 200 else text_content

    # Grab ALL the words in the page
    # regex disgusting...
    words = re.findall(r'\b\w+\b', webpage.get_text(separator=" ", strip=True).lower())

    # Double check and filter out any numbers, symbols, etc.
    # WE ONLY WANT WORDS
    words = [word for word in words if word.isalpha()]

    # Add the information to the index
    indexed_page = {
        "url": webpage_url,
        "title": title,
        "description": description,
        "words": words
    }
    print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(words)} \n")
    return indexed_page
34  search/serving/pagerank.py  Normal file
@@ -0,0 +1,34 @@
def compute_pagerank(graph, damping_factor=0.85, max_iterations=100, tol=1.0e-6):
    # Build the set of all URLs
    all_nodes = set(graph.keys())
    for links in graph.values():
        all_nodes.update(links)
    num_nodes = len(all_nodes)
    # Initialize PageRank scores
    pagerank = {url: 1.0 / num_nodes for url in all_nodes}
    # Identify dangling nodes (nodes with no outgoing links)
    dangling_nodes = [url for url in all_nodes if url not in graph or len(graph[url]) == 0]
    # Iterative computation
    for iteration in range(max_iterations):
        new_pagerank = {}
        # Sum of PageRank scores from dangling nodes
        dangling_sum = damping_factor * sum(pagerank[node] for node in dangling_nodes) / num_nodes
        for url in all_nodes:
            rank = (1.0 - damping_factor) / num_nodes
            rank += dangling_sum
            # Sum contributions from incoming links
            for node in graph:
                if url in graph[node]:
                    out_degree = len(graph[node])
                    rank += damping_factor * pagerank[node] / out_degree
            new_pagerank[url] = rank
        # Check for convergence
        error = sum(abs(new_pagerank[url] - pagerank[url]) for url in all_nodes)
        if error < tol:
            break
        pagerank = new_pagerank
    for url in all_nodes:
        pagerank[url] = round(pagerank[url], 6)
    return pagerank
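
A quick way to sanity-check `compute_pagerank` is to run it on a tiny hand-made link graph. The sketch below assumes it is run from inside the `search/` directory and uses three made-up URLs; it shows the expected input shape (a dict mapping each URL to the set of URLs it links to) and the output (a dict of scores):

```python
from serving.pagerank import compute_pagerank

# Hypothetical three-page graph: A links to B and C, B links to C, C links back to A.
toy_graph = {
    "https://example.com/a": {"https://example.com/b", "https://example.com/c"},
    "https://example.com/b": {"https://example.com/c"},
    "https://example.com/c": {"https://example.com/a"},
}

scores = compute_pagerank(toy_graph)
for url, score in sorted(scores.items(), key=lambda item: item[1], reverse=True):
    print(url, score)
# Page C should come out on top, since both other pages link to it.
```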
136  server/google_search_api.py  Normal file
@@ -0,0 +1,136 @@
from flask import Flask, request, jsonify
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import ssl
from flask_cors import CORS

app = Flask(__name__)
CORS(app)

# NLTK setup (handles SSL certificate issues)
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


# Download NLTK data only if not already downloaded
def download_nltk_resources():
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
    try:
        word_tokenize('test')
    except LookupError:
        nltk.download('punkt')


# Initialize NLTK components
download_nltk_resources()
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


def load_inverted_index(file_path):
    inverted_index = {}
    with open(file_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            word = row['word']
            doc_ids_str = row['doc_ids'].strip("[]")  # Remove brackets
            doc_ids_list = doc_ids_str.split(', ') if doc_ids_str else []
            doc_ids = set(int(doc_id) for doc_id in doc_ids_list)
            inverted_index[word] = doc_ids
    return inverted_index


def load_document_info(file_path):
    document_info = {}
    with open(file_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            doc_id = int(row['doc_id'])
            document_info[doc_id] = {
                'url': row['url'],
                'title': row['title'],
                'description': row['description'],
                'pagerank': float(row['pagerank'])
            }
    return document_info


def parse_query(query):
    # Tokenize the query
    tokens = word_tokenize(query.lower())
    # Remove non-alphabetic tokens and stop words, then stem the words
    query_words = [
        ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words
    ]
    return query_words


def search(query, inverted_index, document_info, num_results=10, page=1):
    query_words = parse_query(query)
    if not query_words:
        return []
    # Find documents that contain any of the query words
    matched_doc_ids = set()
    for word in query_words:
        if word in inverted_index:
            matched_doc_ids.update(inverted_index[word])
    if not matched_doc_ids:
        return []
    # Retrieve documents and their PageRank scores
    results = []
    for doc_id in matched_doc_ids:
        info = document_info[doc_id]
        results.append({
            'doc_id': doc_id,
            'url': info['url'],
            'title': info['title'],
            'description': info['description'],
            'pagerank': info['pagerank']
        })
    # Sort documents by PageRank score
    sorted_results = sorted(results, key=lambda x: x['pagerank'], reverse=True)
    # Pagination
    start = (page - 1) * num_results
    end = start + num_results
    paginated_results = sorted_results[start:end]
    return paginated_results


# Load the inverted index and document info
# If you are using a different file, replace the path with the path to your file
# If you're using a database, replace this with the code to connect to your database
try:
    inverted_index = load_inverted_index('../search/complete_examples/advanced_pagerank_inverted_index.csv')
    document_info = load_document_info('../search/complete_examples/advanced_pagerank.csv')
except FileNotFoundError:
    try:
        inverted_index = load_inverted_index("../advanced_pagerank_inverted_index.csv")
        document_info = load_document_info("../advanced_pagerank.csv")
    except FileNotFoundError:
        print("Error: Files not found, run the advanced_pagerank.py file first")
        print("Exiting...")
        exit()


@app.route('/search')
def search_api():
    query = request.args.get('q', '')
    num_results = int(request.args.get('num_results', 10))
    page = int(request.args.get('page', 1))
    if not query:
        return jsonify({'error': 'No query provided'}), 400
    results = search(query, inverted_index, document_info, num_results=num_results, page=page)
    return jsonify({
        'query': query,
        'page': page,
        'num_results': num_results,
        'results': results
    })


if __name__ == '__main__':
    app.run(debug=True)
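
For a quick end-to-end check once the Flask server is running, you can hit the `/search` endpoint directly. This is a minimal sketch, assuming the default Flask port 5000, that the CSV files from `advanced_pagerank.py` already exist, and that the `requests` package is installed (the crawler already depends on it). It sends the same query shape the client uses in `search.html` (`q`, `page`, `num_results`):

```python
import requests

resp = requests.get(
    "http://127.0.0.1:5000/search",
    params={"q": "google", "page": 1, "num_results": 10},
)
data = resp.json()
print(data["query"], len(data["results"]))
for result in data["results"]:
    # Results come back sorted by PageRank score, highest first.
    print(result["pagerank"], result["title"], result["url"])
```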