To download this script, run `wget https://bash.commongrounds.cc/uploads/1759099453_irs.sh` from the console.
#!/bin/bash
# This script takes in XML files from IRS 990 database and exports all the files if unzipped into a single csv file.
# Output CSV file
output="all_xml_data.csv"
# Write header
echo "filename,ein,org_name,address,city,state,zip,tax_period_end,total_revenue,total_expenses,net_assets_eoy" > "$output"
# Find all XML files recursively and process them
find . -type f -name "*_public.xml" -print0 | while IFS= read -r -d '' file; do
python3 - <<EOF >> "$output"
import xml.etree.ElementTree as ET
import os
file_path = '$file'
filename = os.path.basename(file_path)
try:
tree = ET.parse(file_path)
root = tree.getroot()
# Handle namespaces if present
if root.tag.startswith('{'):
uri = root.tag.split('}')[0][1:]
ns = {'irs': uri}
else:
ns = {}
def find_text(tag):
path = f'.//irs:{tag}' if 'irs' in ns else f'.//{tag}'
elem = root.find(path, ns)
return elem.text.strip() if elem is not None and elem.text else ''
# Get return type
return_type_tag = 'ReturnTypeCd'
return_type_path = f'.//irs:{return_type_tag}' if 'irs' in ns else f'.//{return_type_tag}'
return_type_elem = root.find(return_type_path, ns)
return_type = return_type_elem.text.strip() if return_type_elem else ''
# Determine financial tags based on return type
if return_type == '990':
rev_tag = 'CYTotalRevenueAmt'
exp_tag = 'CYTotalExpensesAmt'
net_tag = 'NetAssetsOrFundBalancesEOYAmt'
elif return_type == '990EZ':
rev_tag = 'TotalRevenueAmt'
exp_tag = 'TotalExpensesAmt'
net_tag = 'NetAssetsOrFundBalancesEOYAmt'
else:
rev_tag = 'TotalRevenueAmt' # Fallback
exp_tag = 'TotalExpensesAmt'
net_tag = 'NetAssetsOrFundBalancesEOYAmt'
ein = find_text('EIN')
# Organization name (concat Line1 and Line2)
name1 = find_text('BusinessNameLine1Txt')
name2 = find_text('BusinessNameLine2Txt')
org_name = name1 + (' ' + name2 if name2 else '')
# Address (concat Line1 and Line2)
addr1 = find_text('AddressLine1Txt')
addr2 = find_text('AddressLine2Txt')
address = addr1 + (' ' + addr2 if addr2 else '')
city = find_text('CityNm')
state = find_text('StateAbbreviationCd')
zipcode = find_text('ZIPCd')
tax_period = find_text('TaxPeriodEndDt')
total_revenue = find_text(rev_tag)
total_expenses = find_text(exp_tag)
net_assets = find_text(net_tag)
# Escape double quotes for CSV (do not remove commas, as fields are quoted)
def esc(s):
return s.replace('"', '""')
print(f'"{esc(filename)}","{ein}","{esc(org_name)}","{esc(address)}","{esc(city)}","{state}","{zipcode}","{tax_period}","{total_revenue}","{total_expenses}","{net_assets}"')
except Exception as e:
print(f'"{filename}","","","","","","","","","",""') # Output empty row with filename on error
EOF
done
BASH to Home