Python script to calculate one's h-Index per year
Here’s a small script I wrote, using Python, scholarly and Google Scholar, to calculate one’s h-Index per year (so you can see how it evolved over your life as a researcher). It tells me in which periods of my career I have been most active in producing the most ‘impactful’ research output. I missed such a capability in otherwise great tools for citation analysis like Harzing’s Publish or Perish.
Here you go:
# -*- coding: utf-8 -*-
"""
This script uses Google Scholar to calculate and plot the evolution of the
h-index of a researcher over the years.
Author: Thilo Stadelmann (stdm@zhaw.ch)
Date: Sept. 20, 2021
Docs: https://scholarly.readthedocs.io/en/stable/quickstart.html#methods-for-scholar
"""
from scholarly import scholarly
from datetime import datetime
import matplotlib.pyplot as plt
AUTHOR_ID = '6U6ZXzUAAAAJ'  # put your author id here


def fetch_publications(author, current_year):
    """Collect the publication records of *author* from Google Scholar.

    Args:
        author: author record as returned by scholarly, already filled with
            its 'publications' section.
        current_year: year used as publication year when Google Scholar
            provides none for an entry.

    Returns:
        A list of dicts with the keys 'title', 'year', 'total_cites',
        'authors' and 'cites_per_year', one per publication.

    Prints one progress marker per publication: 'x' if the publication year
    was missing (and replaced by *current_year*), '.' otherwise.
    """
    publications = []
    for publication in author['publications']:
        # NOTE(review): presumably this loads the details (citations per
        # year etc.) read below -- see the scholarly documentation.
        scholarly.fill(publication)
        bib = publication['bib']
        if 'pub_year' not in bib:  # pub_year is missing in the Google data
            bib['pub_year'] = current_year
            print('x', end='')
        else:
            print('.', end='')
        publications.append({
            'title': bib['title'],
            'year': int(bib['pub_year']),
            'total_cites': int(publication['num_citations']),
            'authors': bib['author'],
            'cites_per_year': publication['cites_per_year'],
        })
    return publications


def h_index(cite_counts):
    """Return the h-index for an iterable of per-paper citation counts.

    The h-index is the largest h such that h papers have at least h
    citations each; an empty input yields 0.
    """
    h = 0
    for rank, cites in enumerate(sorted(cite_counts, reverse=True), start=1):
        if cites < rank:
            # Counts are sorted descending, so no later paper can qualify.
            break
        h = rank
    return h


def cites_up_to(publication, year):
    """Sum the citations of *publication* received up to and including *year*.

    *publication* is a record as built by fetch_publications(); the keys and
    values of its 'cites_per_year' mapping are converted with int().
    """
    return sum(int(count)
               for cited_in, count in publication['cites_per_year'].items()
               if int(cited_in) <= year)


def h_index_per_year(publications, min_year, current_year):
    """Calculate the h-index for every year from *min_year* to *current_year*.

    Args:
        publications: records as built by fetch_publications().
        min_year: first year to report.
        current_year: last year to report (inclusive).

    Returns:
        A dict mapping each year to the h-index reached by the end of that
        year. It is empty if min_year > current_year.
    """
    h_per_year = {}
    for year in range(min_year, current_year + 1):
        if year == current_year:
            # Use the total counts here because the cites per year do not
            # always add up to the total number of cites in the Google data.
            relevant_cites = [p['total_cites'] for p in publications]
        else:
            # Only papers written up to (and including) this year count,
            # with the citations they had gathered until then.
            relevant_cites = [cites_up_to(p, year)
                              for p in publications if p['year'] <= year]
        h_per_year[year] = h_index(relevant_cites)
    return h_per_year


if __name__ == '__main__':
    # get all publications written by author together with their citations
    # per year from Google Scholar
    current_year = datetime.now().year
    author = scholarly.search_author_id(AUTHOR_ID)
    scholarly.fill(author, sections=['publications'])
    print('Retrieving data from Google Scholar for author', author['name'], end='')
    publications = fetch_publications(author, current_year)
    print('done')

    # 2100 is the upper bound for the first year: without any publication
    # the year range below is empty and so is the result.
    min_year = min([2100] + [p['year'] for p in publications])

    # calculate the h-index per year to analyse its evolution
    h_per_year = h_index_per_year(publications, min_year, current_year)

    # Pass real lists instead of dict views to the plotting call.
    plt.plot(list(h_per_year.keys()), list(h_per_year.values()))
    print(h_per_year)
    # Without show() nothing is displayed when run as a plain script.
    plt.show()
Written on September 22, 2021 (last modified: November 16, 2021)