
Commit

first commit
HuberNicolas committed Dec 9, 2024
0 parents commit dda33ea
Showing 6 changed files with 623 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
/data
83 changes: 83 additions & 0 deletions README.md
@@ -0,0 +1,83 @@

# Abstract Scraper

`abstract-scraper` is a Python-based CLI tool that fetches abstracts of scientific articles from their DOIs. It uses the [Pyalex](https://github.com/mattbierbaum/pyalex) library to retrieve metadata from the OpenAlex API and processes large datasets efficiently with parallel workers.
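
The per-DOI lookup that the tool performs can be sketched directly with Pyalex; a minimal example (the DOI below is purely illustrative):

```python
from pyalex import Works

# Illustrative DOI; any resolvable DOI works here
doi_url = "https://doi.org/10.1234/example-doi-1"

work = Works()[doi_url]   # fetch the OpenAlex record for this DOI
print(work["abstract"])   # Pyalex reconstructs the plain-text abstract of the work
```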

## Features
- Fetches abstracts for scientific articles using their DOIs.
- Supports parallel processing with configurable worker count.
- Periodically saves progress to prevent data loss.
- Simple and intuitive command-line interface (CLI).

---

## Installation

### Prerequisites
- Python 3.12 or later
- [Poetry](https://python-poetry.org/) for dependency management

### Steps
1. Clone the repository:
```bash
git clone https://github.com/HuberNicolas/abstract_scraper
cd abstract_scraper
```

2. Install dependencies using Poetry:
```bash
poetry install
```

---

## Usage

Run the script with the following command:
```bash
python -m main <input_file> <output_file> [--num_workers <int>] [--save_interval <int>]
```

### Required Arguments
- `<input_file>`: Path to the CSV file containing a column `doi` with DOIs of the articles.
- `<output_file>`: Path where the updated CSV with fetched abstracts will be saved.

### Optional Arguments
- `--num_workers`: Number of parallel workers to use (default: 4).
- `--save_interval`: Save progress after processing this many rows (default: 50).

### Example
```bash
poetry shell # activate poetry env
python -m main data/sdg_data.csv data/sdg_data_abstracts.csv --num_workers 2 --save_interval 10
```
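
The same run can also be started from Python rather than the CLI (a minimal sketch, assuming `main.py` is importable from the working directory):

```python
from main import main

if __name__ == "__main__":
    # Equivalent to the CLI example above; the guard matters because
    # ProcessPoolExecutor may re-import the calling module in worker processes
    main("data/sdg_data.csv", "data/sdg_data_abstracts.csv", num_workers=2, save_interval=10)
```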

---

## Input File Format

The input CSV file must include a `doi` column with the DOIs of the articles. Example:

| doi |
|-----------------------|
| 10.1234/example-doi-1 |
| 10.5678/example-doi-2 |
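
If you need to assemble such a file programmatically, a small pandas sketch (the file path and DOIs are illustrative):

```python
import pandas as pd

# Illustrative DOIs; replace with your own list
dois = ["10.1234/example-doi-1", "10.5678/example-doi-2"]

pd.DataFrame({"doi": dois}).to_csv("data/input.csv", index=False)
```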

---

## Output File

The script saves the output CSV with the following additional column:
- **`abstract`**: Contains the fetched abstract for each article.

Example output:

| doi | abstract |
|-----------------------|-----------------------------------------|
| 10.1234/example-doi-1 | This is an example abstract. |
| 10.5678/example-doi-2 | Another example abstract. |
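
To check how many rows still lack an abstract after a run, a quick pandas sketch (output path taken from the example above):

```python
import pandas as pd

df = pd.read_csv("data/sdg_data_abstracts.csv")

# Rows whose abstract could not be fetched stay empty (NaN)
missing = df[df["abstract"].isnull()]
print(f"{len(missing)} of {len(df)} rows have no abstract yet")
```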

---

## License

This project is licensed under the MIT License. See the LICENSE file for details.
Binary file added __pycache__/main.cpython-312.pyc
Binary file not shown.
94 changes: 94 additions & 0 deletions main.py
@@ -0,0 +1,94 @@
import pandas as pd
from pyalex import Works
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm
import argparse


def process_row(index, row):
    """
    Process a single row: fetch the abstract for its DOI.
    """
    result = {"index": index, "abstract": None}

    # Skip rows where an abstract is already present
    if pd.notnull(row.get('abstract', None)):
        print(f"Skipping row {index}: abstract already present")
        return result

    doi = row.get('doi', None)
    if pd.isnull(doi) or not doi:
        return result

    doi_url = f"https://doi.org/{doi}"
    try:
        # Fetch work details using Pyalex
        work = Works()[doi_url]

        # Extract abstract (Pyalex reconstructs it from OpenAlex's abstract_inverted_index)
        result["abstract"] = work["abstract"] or None

    except Exception as e:
        print(f"Could not extract abstract for DOI {doi}: {e}")

    return result


def main(input_file, output_file, num_workers, save_interval):
    # Read the input CSV
    df = pd.read_csv(input_file)

    # Add 'abstract' column if not already present
    if 'abstract' not in df.columns:
        df['abstract'] = None

    progress_count = 0  # Counter for processed rows

    # Parallel processing
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {
            executor.submit(process_row, index, row): index
            for index, row in df.iterrows()
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
            try:
                # Retrieve the result
                result = future.result()
                index = result["index"]
                abstract = result["abstract"]

                # Update the DataFrame
                if abstract:
                    df.at[index, 'abstract'] = abstract

                # Increment the progress counter
                progress_count += 1

                # Save progress periodically
                if progress_count % save_interval == 0:
                    print(f"Saving progress at {progress_count} processed rows...")
                    df.to_csv(output_file, index=False)

            except Exception as e:
                print(f"Error during processing: {e}")

    # Final save after all rows are processed
    df.to_csv(output_file, index=False)
    print(f"Processing complete. Results saved to '{output_file}'.")


if __name__ == "__main__":
    # Set up CLI arguments
    parser = argparse.ArgumentParser(description="Process DOIs to fetch abstracts and save periodically.")
    parser.add_argument("input_file", type=str, help="Path to the input CSV file.")
    parser.add_argument("output_file", type=str, help="Path to the output CSV file.")
    parser.add_argument("--num_workers", type=int, default=4, help="Number of parallel workers (default: 4).")
    parser.add_argument("--save_interval", type=int, default=50, help="Save progress after this many rows (default: 50).")

    # Parse arguments
    args = parser.parse_args()

    # Run the main function with parsed arguments
    main(args.input_file, args.output_file, args.num_workers, args.save_interval)