Source code for extract_proposals

import os
import re
import zipfile
import requests
import datetime
import argparse
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import yaml

from bs4 import BeautifulSoup
from xml.dom import minidom

import pdb

from helpers import get_formatted_date


[docs]class GrantsDataExtractor(object):
    """ Class which will extract data from the Grants.Gov website.

    As per design, we will first download the list of all Open proposals from the Grants.gov.
    Later for each proposal, further data is extracted from the dedicated webpage (for example from NSF website).
        
    """

    def __init__(self, xml_url, csv_url, agencies, params):
        """ Constructor

        :param xml_url: The URL from which XML file is to be downloaded
        :type xml_url: `str`
        :param csv_url: The URL from which CSV file is to be downloaded
        :type csv_url: `str`
        :param agencies: List of agencies for which proposals are to be extracted
        :type agencies: `List`
        :param params: Deafult set of parameters read from the CONFIG.yml file
        :type params: `Dict`
        
        :return: None
        """
        self.output_path = params['OUTPUT_PATH']
        self.output_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.output_path )
        self.xml_url = params['XML_URL'] if xml_url == '' else xml_url
        self.csv_url = params['CSV_URL'] if csv_url == '' else csv_url
        self.agencies_filenames = params['AGENCIES_FILENAME_DICT']
        self.agencies = params['AGENCIES'] if agencies == [] else agencies
        self.tags = params['TAGS']
        self.open_proposal_filename = params["OPEN_PROPOSALS_DATASET"]
        self.grants_filename = params["GRANTS_DATASET"]
        self.grants_download_folder = params["GRANTS_DOWNLOAD_FOLDER"]
        self.grants_download_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.grants_download_folder )
        self.grant_downloaded_csv_filename = params["GRANTS_DOWNLOAD_CSV_FILENAME"]
        
        if not os.path.exists(self.output_path):
            os.mkdir(self.output_path)

[docs]    def ExtractCSVData(self):
        """ Function to extract data from the downloaded CSV file 
        Once the data is extracted it will be saved as a dataframe - self.metadata
        
        :param None: 
        
        :return: None
        """

        response = requests.get(self.csv_url)

        if not os.path.exists(
            os.path.join(
                os.getcwd(),
                self.grants_download_folder)):
            os.mkdir(os.path.join(os.getcwd(), self.grants_download_folder))

        open(
            os.path.join(
                os.getcwd(),
                self.grants_download_folder,
                self.grant_downloaded_csv_filename),
            'wb').write(
            response.content)

        # Read data from CSV
        self.metadata = pd.read_csv(
            os.path.join(
                os.getcwd(),
                self.grants_download_folder,
                self.grant_downloaded_csv_filename),
            error_bad_lines=False,
            warn_bad_lines=False)

        column_names = self.metadata.columns
        self.metadata.reset_index(inplace=True)
        self.metadata = self.metadata.iloc[:, :len(column_names)]
        self.metadata.columns = column_names

        # Extract hyperlink
        self.metadata['URL'] = self.metadata['OPPORTUNITY NUMBER'].apply(
            lambda x: x.split('"')[1]) 
        self.metadata['OPPORTUNITY NUMBER'] = self.metadata['OPPORTUNITY NUMBER'].apply(
            lambda x: x.split('"')[-2])

[docs]    def ExtractXMLData(self):
        """ Function to extract data from the XML file.
        Once the data is extracted it will be saved as a dataframe - self.opps_df
        
        :param None: 
        
        :return: None
        """

        response = requests.request("GET", self.xml_url, headers={}, data={})
        response_str = response.text
        soup = BeautifulSoup(response_str, "html.parser")

        zip_url = soup.findAll('a', href=True, text=re.compile(
            "GrantsDBExtract"))[-1]['href']
        filename = zip_url.split('/')[-1]
        print("DOWNLOADING ZIP FILE FROM - ", zip_url)
        response = requests.get(zip_url)
        open(
            os.path.join(
                os.getcwd(),
                self.grants_download_folder,
                filename),
            'wb').write(
            response.content)

        with zipfile.ZipFile(os.path.join(os.getcwd(), self.grants_download_folder, filename), 'r') as zip_ref:
            zip_ref.extractall(
                os.path.join(
                    os.getcwd(),
                    self.grants_download_folder))
        doc = minidom.parse(
            os.path.join(
                os.getcwd(),
                self.grants_download_folder,
                filename.replace(
                    '.zip',
                    '') + '.xml'))

        opps = doc.getElementsByTagName("OpportunitySynopsisDetail_1_0")

        opp_list = []
        for opp in tqdm(opps):
            dict_ = {}
            for tag in self.tags:
                try:
                    dict_[tag] = opp.getElementsByTagName(
                        tag)[0].firstChild.data
                except BaseException:
                    dict_[tag] = ''
            opp_list.append(dict_)

        self.opps_df = pd.DataFrame(opp_list)

[docs]    def ProcessXMLData(self):
        """ Function to process extracted the XML data.
        Reformat columns - CloseDate, PostDate. LastUpdateDate.
        Identify Open Proposals.

        :param None : 
        
        :return: None
        """

        # Reformate Columns
        self.opps_df['CloseDate'] = get_formatted_date(
            data=self.opps_df['CloseDate'], format_='%m%d%Y')
        self.opps_df['PostDate'] = get_formatted_date(
            data=self.opps_df['PostDate'], format_='%m%d%Y')
        self.opps_df['LastUpdatedDate'] = get_formatted_date(
            data=self.opps_df['LastUpdatedDate'], format_='%m%d%Y')

        self.data = pd.merge(self.opps_df,
                             self.metadata[['OPPORTUNITY NUMBER',
                                            'URL']],
                             how='left',
                             left_on='OpportunityNumber',
                             right_on='OPPORTUNITY NUMBER')

        # Identify Open proposals
        self.open_df = self.data[(
            self.data['CloseDate'] > datetime.date.today())]
        self.open_df.reset_index(drop=True, inplace=True)

[docs]    def SaveXMLData(self):
        """ Function to save all the XML Data to CSV files. Specifically, Open Proposals agency wise will be saved in seprate files.

        :param None : 
        
        :return: None
        """

        for agency in self.agencies:
            agency_dataset = self.open_df[self.open_df['AgencyName'] == agency]
            agency_dataset.to_csv(
                os.path.join(
                    self.output_path,
                    self.agencies_filenames[agency]))

        self.data.to_csv(
            os.path.join(
                self.output_path,
                self.grants_filename),
            index=False)
        self.open_df.to_csv(
            os.path.join(
                self.output_path,
                self.open_proposal_filename),
            index=False)

if __name__ == "__main__":

    # Read arguments from command line (cmd). If no input via cmd, use config
    # file
    parser = argparse.ArgumentParser(description="Parameter file")
    parser.add_argument(
        '--config_file',
        metavar='FILENAME',
        type=str,
        default='config.yml',
        help='Parameter file name in yaml format')
    parser.add_argument(
        '--xml_url',
        metavar='XML_URL',
        type=str,
        default='',
        help='URL to download the XML FILE')
    parser.add_argument(
        '--csv_url',
        metavar='XML_URL',
        type=str,
        default='',
        help='URL to download the CSV FILE')
    parser.add_argument(
        '--agencies',
        metavar='AGENCIES',
        type=list,
        default=[
            'National Science Foundation',
            'National Institutes of Health'],
        help='List of agencies for which proposals are to be extracted')
    args = parser.parse_args()
    
    print("\n\nExtracting Proposals from Grants.gov")

    try:
        params = yaml.safe_load(open(args.config_file))
    except BaseException:
        print(f'Error loading parameter file: {args.config_file}.')
        sys.exit(1)

    data_extractor = GrantsDataExtractor(
        xml_url=args.xml_url,
        csv_url=args.csv_url,
        agencies=args.agencies,
        params=params)
    data_extractor.ExtractCSVData()
    data_extractor.ExtractXMLData()
    data_extractor.ProcessXMLData()
    data_extractor.SaveXMLData()
    
    print("TASK COMPLETED : Successfully Extracted Proposals ..")