Source code for extract_proposals

import os
import re
import zipfile
import requests
import datetime
import argparse
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import yaml

from bs4 import BeautifulSoup
from xml.dom import minidom

import pdb

from helpers import get_formatted_date


class GrantsDataExtractor(object):
    """
    Class which extracts data from the Grants.gov website.

    As per design, the list of proposals is first downloaded from Grants.gov
    (a CSV export and a zipped XML database extract). The two sources are
    merged, open proposals are identified and the results are saved as CSV
    files. Further per-proposal data (for example from the NSF website) is
    not extracted by this class.
    """

    def __init__(self, xml_url, csv_url, agencies, params):
        """
        Constructor

        :param xml_url: The URL from which XML file is to be downloaded.
            An empty value falls back to ``params['XML_URL']``.
        :type xml_url: `str`
        :param csv_url: The URL from which CSV file is to be downloaded.
            An empty value falls back to ``params['CSV_URL']``.
        :type csv_url: `str`
        :param agencies: List of agencies for which proposals are to be
            extracted. An empty list falls back to ``params['AGENCIES']``.
        :type agencies: `List`
        :param params: Default set of parameters read from the CONFIG.yml file
        :type params: `Dict`
        :return: None
        """
        # Relative paths from the configuration are resolved against the
        # directory containing this module, not the current working directory.
        base_dir = os.path.dirname(os.path.abspath(__file__))

        self.output_path = os.path.join(base_dir, params['OUTPUT_PATH'])
        self.xml_url = xml_url or params['XML_URL']
        self.csv_url = csv_url or params['CSV_URL']
        self.agencies_filenames = params['AGENCIES_FILENAME_DICT']
        self.agencies = agencies or params['AGENCIES']
        self.tags = params['TAGS']
        self.open_proposal_filename = params["OPEN_PROPOSALS_DATASET"]
        self.grants_filename = params["GRANTS_DATASET"]
        self.grants_download_folder = os.path.join(
            base_dir, params["GRANTS_DOWNLOAD_FOLDER"])
        self.grant_downloaded_csv_filename = params[
            "GRANTS_DOWNLOAD_CSV_FILENAME"]

        # makedirs also creates missing parent directories and does not fail
        # when the directory already exists.
        os.makedirs(self.output_path, exist_ok=True)

    @staticmethod
    def _download(url, destination):
        """
        Download ``url`` and write the response body to ``destination``.

        :param url: The URL to fetch
        :type url: `str`
        :param destination: Path of the file to (over)write
        :type destination: `str`
        :return: None
        :raises requests.HTTPError: if the server answers with an error status
        """
        response = requests.get(url)
        # Fail loudly instead of saving an HTML error page as if it were data.
        response.raise_for_status()
        with open(destination, 'wb') as handle:
            handle.write(response.content)

    def ExtractCSVData(self):
        """
        Function to extract data from the downloaded CSV file.

        The CSV file is downloaded from ``self.csv_url`` into the download
        folder. Once the data is extracted it is saved as a dataframe -
        ``self.metadata`` - with an added ``URL`` column.

        :param None:
        :return: None
        :raises requests.HTTPError: if the download fails
        """
        os.makedirs(self.grants_download_folder, exist_ok=True)
        csv_path = os.path.join(
            self.grants_download_folder, self.grant_downloaded_csv_filename)
        self._download(self.csv_url, csv_path)

        # Read data from CSV, silently skipping malformed lines.
        try:
            self.metadata = pd.read_csv(csv_path, on_bad_lines='skip')
        except TypeError:
            # pandas < 1.3 does not know ``on_bad_lines``.
            self.metadata = pd.read_csv(
                csv_path, error_bad_lines=False, warn_bad_lines=False)

        # Move the index back into the frame, keep only as many columns as
        # the header declared and re-apply the header names.
        # NOTE(review): this presumably compensates for rows carrying a
        # trailing delimiter (pandas then uses the first field as the index)
        # - confirm against an actual export.
        column_names = self.metadata.columns
        self.metadata.reset_index(inplace=True)
        self.metadata = self.metadata.iloc[:, :len(column_names)].copy()
        self.metadata.columns = column_names

        # Extract hyperlink: the URL is the first double-quoted token of the
        # cell and the opportunity number is the last double-quoted token.
        self.metadata['URL'] = self.metadata['OPPORTUNITY NUMBER'].apply(
            lambda x: x.split('"')[1])
        self.metadata['OPPORTUNITY NUMBER'] = self.metadata[
            'OPPORTUNITY NUMBER'].apply(lambda x: x.split('"')[-2])

    def ExtractXMLData(self):
        """
        Function to extract data from the XML file.

        The page at ``self.xml_url`` is scanned for links whose text contains
        "GrantsDBExtract"; the last one is downloaded, unzipped and parsed.
        Once the data is extracted it is saved as a dataframe -
        ``self.opps_df`` - with one column per entry of ``self.tags``.

        :param None:
        :return: None
        :raises requests.HTTPError: if a download fails
        :raises IndexError: if no matching link is found on the page
        """
        response = requests.get(self.xml_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        extract_links = soup.find_all(
            'a', href=True, text=re.compile("GrantsDBExtract"))
        zip_url = extract_links[-1]['href']
        filename = zip_url.split('/')[-1]
        print("DOWNLOADING ZIP FILE FROM - ", zip_url)

        # Do not rely on ExtractCSVData having created the folder already.
        os.makedirs(self.grants_download_folder, exist_ok=True)
        zip_path = os.path.join(self.grants_download_folder, filename)
        self._download(zip_url, zip_path)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(self.grants_download_folder)

        # The XML file is looked up under the archive's own name with the
        # ".zip" part removed and ".xml" appended.
        xml_path = os.path.join(
            self.grants_download_folder, filename.replace('.zip', '') + '.xml')
        doc = minidom.parse(xml_path)
        opps = doc.getElementsByTagName("OpportunitySynopsisDetail_1_0")

        opp_list = []
        for opp in tqdm(opps):
            record = {}
            for tag in self.tags:
                try:
                    record[tag] = opp.getElementsByTagName(
                        tag)[0].firstChild.data
                except (IndexError, AttributeError):
                    # Tag missing (IndexError) or element without a text
                    # child (AttributeError): store an empty value.
                    record[tag] = ''
            opp_list.append(record)
        self.opps_df = pd.DataFrame(opp_list)

    def ProcessXMLData(self):
        """
        Function to process the extracted XML data.

        Reformat columns - CloseDate, PostDate, LastUpdatedDate. Merge the
        URL from ``self.metadata`` into ``self.data`` and identify open
        proposals (``self.open_df``), i.e. those whose CloseDate is later
        than today.

        :param None :
        :return: None
        """
        # Reformat columns.
        # NOTE(review): get_formatted_date is defined in helpers; it must
        # return values comparable with datetime.date for the filter below.
        for column in ('CloseDate', 'PostDate', 'LastUpdatedDate'):
            self.opps_df[column] = get_formatted_date(
                data=self.opps_df[column], format_='%m%d%Y')

        # Left join: every XML opportunity is kept, with a URL where the CSV
        # export has a matching opportunity number.
        self.data = pd.merge(self.opps_df,
                             self.metadata[['OPPORTUNITY NUMBER', 'URL']],
                             how='left',
                             left_on='OpportunityNumber',
                             right_on='OPPORTUNITY NUMBER')

        # Identify Open proposals. reset_index returns a new frame, which
        # avoids mutating a filtered view of self.data in place.
        is_open = self.data['CloseDate'] > datetime.date.today()
        self.open_df = self.data[is_open].reset_index(drop=True)

    def SaveXMLData(self):
        """
        Function to save all the XML data to CSV files.

        Open proposals are saved agency wise in separate files (named by
        ``self.agencies_filenames``), followed by the complete dataset and
        the complete list of open proposals.

        :param None :
        :return: None
        :raises KeyError: if an agency has no entry in the filename mapping
        """
        for agency in self.agencies:
            agency_dataset = self.open_df[self.open_df['AgencyName'] == agency]
            # The per-agency files keep the dataframe index as first column,
            # unlike the two files written below.
            agency_dataset.to_csv(
                os.path.join(
                    self.output_path, self.agencies_filenames[agency]))

        self.data.to_csv(
            os.path.join(self.output_path, self.grants_filename),
            index=False)
        self.open_df.to_csv(
            os.path.join(self.output_path, self.open_proposal_filename),
            index=False)
def build_arg_parser():
    """
    Build the command line parser for the extraction script.

    :return: Parser exposing --config_file, --xml_url, --csv_url and
        --agencies
    :rtype: `argparse.ArgumentParser`
    """
    parser = argparse.ArgumentParser(description="Parameter file")
    parser.add_argument(
        '--config_file',
        metavar='FILENAME',
        type=str,
        default='config.yml',
        help='Parameter file name in yaml format')
    parser.add_argument(
        '--xml_url',
        metavar='XML_URL',
        type=str,
        default='',
        help='URL to download the XML FILE')
    parser.add_argument(
        '--csv_url',
        metavar='CSV_URL',
        type=str,
        default='',
        help='URL to download the CSV FILE')
    # nargs='+' collects one or more agency names into a list. The previous
    # type=list converted the single supplied string into a list of its
    # characters.
    # NOTE(review): because this default is non-empty, the AGENCIES entry of
    # the config file is never used when running as a script.
    parser.add_argument(
        '--agencies',
        metavar='AGENCIES',
        type=str,
        nargs='+',
        default=[
            'National Science Foundation',
            'National Institutes of Health'],
        help='List of agencies for which proposals are to be extracted')
    return parser


def main(argv=None):
    """
    Run the complete extraction: download, process and save the proposals.

    Arguments are read from the command line (cmd). Values not given via cmd
    fall back to the defaults of the parser and, for the URLs, to the config
    file.

    :param argv: Argument list to parse; ``None`` means ``sys.argv[1:]``
    :type argv: `List`
    :return: None
    """
    args = build_arg_parser().parse_args(argv)

    print("\n\nExtracting Proposals from Grants.gov")

    try:
        with open(args.config_file) as config_handle:
            params = yaml.safe_load(config_handle)
    except (OSError, UnicodeDecodeError, yaml.YAMLError):
        # Only genuine load failures are reported here; Ctrl-C and
        # SystemExit are no longer intercepted.
        print(f'Error loading parameter file: {args.config_file}.')
        sys.exit(1)

    data_extractor = GrantsDataExtractor(
        xml_url=args.xml_url,
        csv_url=args.csv_url,
        agencies=args.agencies,
        params=params)
    data_extractor.ExtractCSVData()
    data_extractor.ExtractXMLData()
    data_extractor.ProcessXMLData()
    data_extractor.SaveXMLData()

    print("TASK COMPLETED : Successfully Extracted Proposals ..")


if __name__ == "__main__":
    main()