PSI4 on CentOS cluster

No, a single Psi4 task cannot be sent to more than one node. Psi4 is strictly intra-node parallel.

We’ve had a few issues with math libraries and parallelism lately (here and here). What’s the version of your binary (`psi4 --version`)? Does running this show a speedup? And what is the output of `python thread.py` and of `psi4 thread.py`, using the script below?

thread.py

import os
import time

# Psithon (run as `psi4 thread.py`): no import lines are needed

# PsiAPI (run as `python thread.py`), good import order: numpy before psi4
import numpy as np
import psi4

# PsiAPI, bad import order: psi4 before numpy (kept commented out for comparison)
#import psi4
#import numpy as np

def test_threaded_blas():
    """Compare threaded matrix-multiply timings of Psi4 and NumPy.

    For each thread count (1 and 6) and each square matrix size, time
    ``psi4.core.Matrix.gemm`` and ``numpy.dot`` on matrices of that size and
    print the average time per multiply.  For the largest size the timings are
    recorded, and two ratios are printed at the end:

    * NumPy vs. Psi4 at the highest thread count (expected to be about 1), and
    * Psi4 single-threaded vs. Psi4 at the highest thread count (expected to
      be about the thread count, i.e. near-linear speedup).

    Finally, the lines of this process's ``/proc/<pid>/maps`` that contain
    ``mkl`` are printed via ``grep`` (Linux-only diagnostic).

    Raises:
        AssertionError: if ``psi4.get_num_threads()`` does not report the
            requested thread count after the largest-size benchmark.
    """
    # Imported locally so this block stays self-contained; default_timer is
    # the benchmark clock recommended by the standard library.
    from timeit import default_timer

    max_threads = 6
    sizes = [200, 500, 2000, 4000]
    thread_counts = [1, max_threads]

    # Timings are only recorded for the largest matrix; derive it from the
    # list so editing ``sizes`` cannot silently break the ratio lookup below.
    largest = sizes[-1]

    times = {}

    for th in thread_counts:
        psi4.set_num_threads(th)

        for sz in sizes:
            # Scale the repeat count so every size does roughly the same
            # amount of arithmetic (~1e10 / sz**3 multiplies, at least one).
            nruns = max(1, int(1.e10 / (sz ** 3)))

            a = psi4.core.Matrix(sz, sz)
            b = psi4.core.Matrix(sz, sz)
            c = psi4.core.Matrix(sz, sz)

            start = default_timer()
            for _ in range(nruns):
                c.gemm(False, False, 1.0, a, b, 0.0)
            retp4 = (default_timer() - start) / nruns

            # Build the NumPy arrays once, outside the timed loop, so the
            # measurement covers the multiply rather than repeated conversion.
            # NOTE(review): relies on np.asarray(Matrix) being usable as the
            # ``out`` target, exactly as the per-iteration call did before.
            a_np = np.asarray(a)
            b_np = np.asarray(b)
            c_np = np.asarray(c)

            start = default_timer()
            for _ in range(nruns):
                np.dot(a_np, b_np, out=c_np)
            retnp = (default_timer() - start) / nruns

            print("Time for threads %2d, size %5d: Psi4: %12.6f  NumPy: %12.6f" % (th, sz, retp4, retnp))
            if sz == largest:
                times["p4-n{}".format(th)] = retp4
                times["np-n{}".format(th)] = retnp
                # Check that the requested thread count is still in effect
                # after the heaviest workload has run.
                assert psi4.get_num_threads() == th

    lo, hi = thread_counts[0], thread_counts[-1]
    rat1 = times["np-n" + str(hi)] / times["p4-n" + str(hi)]
    rat2 = times["p4-n" + str(lo)] / times["p4-n" + str(hi)]
    print("  NumPy@n%d : Psi4@n%d ratio (want ~1): %.2f" % (hi, hi, rat1))
    print("   Psi4@n%d : Psi4@n%d ratio (want ~%d): %.2f" % (lo, hi, hi, rat2))

    # Show which mapped files of this process mention "mkl" (Linux /proc only).
    os.system('grep mkl /proc/%d/maps' % os.getpid())


# Script entry point: run the benchmark when executed directly
# (`python thread.py` or `psi4 thread.py`), not when imported.
if "__main__" == __name__:
    test_threaded_blas()