Python script to calculate one's h-Index per year

Here’s a small script I wrote, using Python, scholarly and Google Scholar, to calculate one’s h-index per year (so you can see how it evolved over your life as a researcher). It tells me in which periods of my career I was most active in producing ‘impactful’ research output. I missed this capability in otherwise great tools for citation analysis like Harzing’s Publish or Perish.

Here you go:

# -*- coding: utf-8 -*-
"""
This script uses Google Scholar to calculate and plot the evolution of the
h-index of a researcher over the years.

Author: Thilo Stadelmann (stdm@zhaw.ch)
Date: Sept. 20, 2021
Docs: https://scholarly.readthedocs.io/en/stable/quickstart.html#methods-for-scholar
"""

from scholarly import scholarly
from datetime import datetime
import matplotlib.pyplot as plt

#get all publications written by author together with their citations per year from Google Scholar
publications = []
min_year = 2100 #sentinel later than any real publication year; lowered while scanning below
current_year = datetime.now().year
author = scholarly.search_author_id('6U6ZXzUAAAAJ') #put your author id here
scholarly.fill(author, sections=['publications'])
#progress output is flushed explicitly, otherwise it may stay buffered until the final newline
print('Retrieving data from Google Scholar for author', author['name'], end='', flush=True)
for publication in author['publications']:
    scholarly.fill(publication) #load the full record; 'cites_per_year' is read from it below
    bib = publication['bib']
    if 'pub_year' in bib:
        print('.', end='', flush=True)
    else: #pub_year is missing in the Google data: fall back to the current year and mark with 'x'
        bib['pub_year'] = current_year
        print('x', end='', flush=True)
    pub_year = int(bib['pub_year'])
    publications.append({'title': bib['title'],
                         'year': pub_year,
                         'total_cites': int(publication['num_citations']),
                         'authors': bib['author'],
                         'cites_per_year': publication['cites_per_year']})
    #remember the earliest publication year; the per-year analysis starts there
    min_year = min(min_year, pub_year)
print('done')

#print(publications)
        
#calculate the h-index per year to analyse its evolution
def _cites_until(publication, year):
    """Return how often `publication` has been cited up to and including `year`."""
    return sum(int(count)
               for cite_year, count in publication['cites_per_year'].items()
               if int(cite_year) <= year)


def _h_index(cites):
    """Return the largest h such that h of the counts in `cites` are each >= h."""
    h = 0
    for rank, count in enumerate(sorted(cites, reverse=True), start=1):
        if count < rank: #the rank-th most cited paper has too few citations: h cannot grow further
            break
        h = rank
    return h


h_per_year = {}
for year in range(min_year, current_year+1):
    if year == current_year:
        #use the totals of all publications here, because the cites per year do not
        #always add up to the total number of cites in the Google data
        relevant_cites = [publication['total_cites'] for publication in publications]
    else:
        #only publications written up to (+including) year count, with their cites until that year
        relevant_cites = [_cites_until(publication, year)
                          for publication in publications
                          if publication['year'] <= year]
    h_per_year[year] = _h_index(relevant_cites)

#plot the evolution of the h-index over the years and print the underlying numbers
plt.plot(list(h_per_year.keys()), list(h_per_year.values()))
plt.xlabel('year')
plt.ylabel('h-index')
print(h_per_year)
#without show() no figure appears when this is run as a plain script; it is called last
#because it may block until the plot window is closed
plt.show()
Written on September 22, 2021 (last modified: November 16, 2021)
comments powered by Disqus