近期工作的一些问题记录。
1 包管理 库的管理搜索
1 2 3 apt list --install | grep apt search packageKeyWord whereis libscalapack-openmpi
默认的动态库搜索路径是/usr/lib和/lib,ls /etc/ld.so.conf.d/可查看链接配置文件,其中可能会有搜索路径配置,例如WSL上 cat /etc/ld.so.conf.d/x86_64-linux-gnu.conf ,有:
1 2 3 4 # Multiarch support /usr/local/lib/x86_64-linux-gnu /lib/x86_64-linux-gnu /usr/lib/x86_64-linux-gnu
说明/usr/local/lib/x86_64-linux-gnu也在动态库搜索路径中。
静态库的默认搜索路径是/usr/local/lib和/lib。和动态库不同,因为动态库是全局的所以分为了/lib和/usr/lib这样系统/用户两个路径,然后通过ld.so.conf以及LD_LIBRARY_PATH补充,而静态库通过-L指定路径。
mpi的编译器
mpif90是一个包装器,可以检查mpi指定的底层编译器是哪一个:
1 2 3 4 mpif90 --showme:command gfortran mpif90 --showme:link -I/usr/lib/x86_64-linux-gnu/openmpi/lib -L/usr/lib/x86_64-linux-gnu/openmpi/lib/fortran/gfortran -lmpi_usempif08 -lmpi_usempi_ignore_tkr -lmpi_mpifh -lmpi -lopen-rte -lopen-pal -lhwloc -levent_core -levent_pthreads -lm -lz
Scalapack链接
之前在本地环境是直接apt install了scalapack-openmpi并直接链接了动态库版本,可以直接find_package找到,在集群上有点差别。首先检查module是否提供了软件环境,直接load:
1 2 module load mathlib/lapack/gnu/3.8.0 module load mathlib/openblas/gnu/0.3.7
load之后不一定能链接的上,因为scalapack的.cmake没有被装到cmake能自动查找到的路径,所以需要手动搜索一下.cmake文件在哪里,然后加入到cmakelists当中:
1 set (CMAKE_PREFIX_PATH "/public/software/mathlib/scalapack/GNU/7.2.1" CACHE PATH "Path to ScaLAPACK installation" )
2 进程被kill问题排查 本地运行DB-SpGEMM进程被kill,定位信息。
1 2 dmesg -T | egrep -i 'killed process' [Wed May 21 11:21:39 2025] Out of memory: Killed process 2358 (TRS4example) total-vm:8959856kB, anon-rss:6007352kB, file-rss:0kB, shmem-rss:464kB, UID:1000 pgtables:15864kB oom_score_adj:0是什么意思
可以确认是OOM导致被kill的,进一步用mallocinfo去确认哪里触发了OOM(valgrind未排查出问题):
1 2 3 4 5 6 struct mallinfo2 mi_before , mi_after ; mi_before = mallinfo2(); mi_after = mallinfo2(); if (matA.process_grid->global_rank==0 ) printf ("I=%d J=%d Heap alloc delta: %zu bytes\n" , I, J, mi_after.uordblks - mi_before.uordblks);
最终定位到是InverseRootSquare-Matmul一个循环中有一个多层vector的insert导致发生了OOM,程序正常运行的话,从函数返回会正常析构,但计算1024个原子的时候就会在这个循环中OOM导致MPI进程被kill,确认数据块使用完毕后将vector清空就可以了:
1 vector <double >().swap( LocalSliceMat[I][J] );
3 链接问题 编写的静态库find_package找到了需要使用的blas库,但是静态库不会携带依赖的动态库信息,所以外部链接该库时不会自动链接该blas,但是程序能正常运行,说明有默认的blas库会被加载(或者静态库直接被编译到了静态库中),这导致了性能有极大的问题,但只要在应用编译时显示链接需要的blas库就能解决。
该问题在测试时没有注意到是因为库的同一工程下的测试程序会因为CMake的目标传播机制,自动完成需要的库的链接:
1 2 3 4 find_package (BLAS REQUIRED)add_library (dbspgemmTRS4 STATIC ...)target_link_libraries (dbspgemmTRS4 PRIVATE ${BLAS_LIBRARIES} )target_link_libraries (example PRIVATE dbspgemmTRS4)
如果单独使用.a,所有需要的动态库都必须明确显示链接。
4 TSR4 in NTPoly Input output
1 2 3 4 5 6 7 8 SUBROUTINE TRS4(H, ISQ, trace, K, energy_value_out, chemical_potential_out, solver_parameters_in) TYPE (Matrix_ps), INTENT (IN ) :: H TYPE (Matrix_ps), INTENT (IN ) :: ISQ REAL (NTREAL), INTENT (IN ) :: trace TYPE (Matrix_ps), INTENT (INOUT) :: K REAL (NTREAL), INTENT (OUT ), OPTIONAL :: energy_value_out REAL (NTREAL), INTENT (OUT ), OPTIONAL :: chemical_potential_out TYPE (SolverParameters_t), INTENT (IN ), OPTIONAL :: solver_parameters_in
initilize
1 2 3 4 5 6 IF (PRESENT (solver_parameters_in)) THEN CALL CopySolverParameters(solver_parameters_in, params) ELSE CALL ConstructSolverParameters(params) END IF
convergence monitor
1 2 CALL ConstructMonitor(params%monitor, automatic_in=params%monitor_convergence, tight_cutoff_in=params%converge_diff)
相似变换
1 2 3 4 5 6 7 8 9 CALL ConstructEmptyMatrix(K, H) CALL ConstructEmptyMatrix(WH, H) CALL FillMatrixIdentity(IMat) CALL TransposeMatrix(ISQ, ISQT)CALL SimilarityTransform(H, ISQ, ISQT, WH, pool, threshold_in=params%threshold)
1 2 3 4 5 IF (params%do_load_balancing) THEN CALL PermuteMatrix(WH, WH, params%BalancePermutation, memorypool_in=pool) CALL PermuteMatrix(IMat, IMat, params%BalancePermutation, memorypool_in=pool) END IF
特征值范围与矩阵初始化X_k
1 2 3 4 5 6 7 8 CALL GershgorinBounds(WH, e_min, e_max)CALL CopyMatrix(WH, X_k)CALL ScaleMatrix(X_k, -1.0_NTREAL ) CALL IncrementMatrix(IMat, X_k, alpha_in=e_max) CALL ScaleMatrix(X_k, 1.0_NTREAL /(e_max - e_min))
TSR4主循环
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 DO II = 1 , params%max_iterations CALL MatrixMultiply(X_k, X_k, X_k2, threshold_in=params%threshold, memory_pool_in=pool) CALL CopyMatrix(X_k2, Fx_right) CALL ScaleMatrix(Fx_right, -3.0_NTREAL ) CALL IncrementMatrix(X_k, Fx_right, alpha_in=4.0_NTREAL ) CALL CopyMatrix(IMat, Gx_right) CALL IncrementMatrix(X_k, Gx_right, alpha_in=-2.0_NTREAL ) CALL IncrementMatrix(X_k2, Gx_right) CALL DotMatrix(X_k2, Fx_right, trace_fx) CALL DotMatrix(X_k2, Gx_right, trace_gx) IF (ABS (trace_gx) < 1.0e-14_NTREAL ) THEN sigma_array(II) = 0.5 *(sigma_max - sigma_min) ELSE sigma_array(II) = (trace - trace_fx)/trace_gx END IF IF (sigma_array(II) > sigma_max) THEN CALL CopyMatrix(X_k, TempMat) CALL ScaleMatrix(TempMat, 2.0_NTREAL ) CALL IncrementMatrix(X_k2, TempMat, alpha_in=-1.0_NTREAL ) ELSE IF (sigma_array(II) < sigma_min) THEN CALL CopyMatrix(X_k2, TempMat) ELSE CALL ScaleMatrix(Gx_right, sigma_array(II)) CALL IncrementMatrix(Fx_right, Gx_right) CALL MatrixMultiply(X_k2, Gx_right, TempMat, threshold_in=params%threshold, memory_pool_in=pool) END IF CALL IncrementMatrix(TempMat, X_k, alpha_in=-1.0_NTREAL ) CALL CopyMatrix(TempMat, X_k) CALL DotMatrix(X_k, WH, energy_value) CALL AppendValue(params%monitor, energy_value - energy_value_old) IF (CheckConverged(params%monitor, params%be_verbose)) EXIT END DO
反变换与化学势计算
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 IF (params%do_load_balancing) THEN CALL UndoPermuteMatrix(X_k, X_k, params%BalancePermutation, memorypool_in=pool) END IF CALL SimilarityTransform(X_k, ISQT, ISQ, K, pool, threshold_in=params%threshold) IF (PRESENT (chemical_potential_out)) THEN interval_a = 0.0_NTREAL interval_b = 1.0_NTREAL midpoint = 0.0_NTREAL midpoints: DO II = 1 , params%max_iterations midpoint = (interval_b - interval_a) / 2.0_NTREAL + interval_a zero_value = midpoint polynomial: DO JJ = 1 , total_iterations IF (sigma_array(JJ) .GT. sigma_max) THEN zero_value = 2.0_NTREAL * zero_value - zero_value*zero_value ELSE IF (sigma_array(JJ) .LT. sigma_min) THEN zero_value = zero_value * zero_value ELSE tempfx = (zero_value * zero_value) * & & (4.0_NTREAL * zero_value - & & 3.0_NTREAL * zero_value * zero_value) tempgx = (zero_value*zero_value) * (1.0_NTREAL - zero_value) & & * (1.0_NTREAL - zero_value) zero_value = tempfx + sigma_array(JJ) * tempgx END IF END DO polynomial IF (zero_value .LT. 0.5_NTREAL ) THEN interval_a = midpoint ELSE interval_b = midpoint END IF IF (ABS (zero_value-0.5_NTREAL ) .LT. params%converge_diff) THEN EXIT END IF END DO midpoints chemical_potential_out = e_max + (e_min - e_max) * midpoint END IF
5 逆平方根实现 NTPoly wiki 给出的Inverse Suare Root的实现方法:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 def ComputeSqrt (InputMatrix ): matrix_dimension = InputMatrix.shape[0 ] identity_matrix = identity(matrix_dimension) min_val, max_val = Helper.EigenCircle(InputMatrix) lamda = 1 /max_val Yk = InputMatrix.copy() Zk = identity_matrix.copy() for i in range (0 ,1000 ): Xk = lamda*Yk.dot(Zk) Tk = 0.5 *(3.0 *identity_matrix - Xk) Ykplusone = Tk.dot(Yk) norm_value = norm(Yk - Ykplusone) Zk = Zk.dot(Tk) Yk = Ykplusone.copy() if (norm_value < 1e-10 ): break return_mat = sqrt(lamda)*Yk inverse_return_mat = sqrt(lamda)*Zk
根据NTPOLY的调用链,norm计算的是列和范数,即每一列所有的绝对值进行累加,得到每列的L1范数,然后获取所有列的最大L1范数,作为判断是否收敛的标准。
对于12000*12000的矩阵,分发时间占比最大。
6.1 SCALAPACK 2 DBSpGEMM 6.1.1 接口说明 首先要有一个scalapack的ictxt和desc。
转换的第一步是根据矩阵形状获取对齐保证进程均分矩阵的虚拟矩阵大小,然后根据这个建立DB-SpGEMM的进程网格:
1 2 int target_block_size = atoi(argv[3 ]), total_blocks, aligned_mtx_size;ProcessGrid pg;
获取填充矩阵整体大小的函数为getBlocks:
1 2 3 4 5 6 void getBlocks (int N, int process_rows, int process_columns, int spgemm_block_size, int &aligned_mtx_size, int &total_blocks) ;
使用getBlocks获取的对齐后的矩阵大小来创建进程网格,并构造一个空矩阵用于转换,为了支持slcie>1的场景,nproc_row nproc_col slice必须与总进程数相同,nproc_row和nproc_col在支持slice=1后,可以与原网格不同。目前场景下,转换前后的网格暂时必须保持完全一致。
1 2 pg.ConstructProcessGrid(MPI_COMM_WORLD, nproc_row, nproc_col, 1 , total_blocks); DistBlockMat distblockmat (aligned_mtx_size, &pg) ;
格式转换函数如下:
1 2 3 4 5 6 7 8 9 void formatTransform (int ictxt, int *desc_origin , double *local_origin, int myrow, int mycol, int spgemm_block_size, int slice, ProcessGrid *pg, DistBlockMat &distblockmat, int total_blocks) ;
6.1.2 实现 formatTransform内部会创建一个scalapack网格来完成矩阵数据的分发,这个网格对应的矩阵大小是原矩阵大小,但是进程块的行数列数是按照SpGEMM的格式计算的对齐可均分的大小,部分进程将得到不完整的块或没有数据块。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 int desc_trans[9 ], trans_rows, trans_cols, mtx_size, info, ZERO = 0 , ONE = 1 ; mtx_size = desc_origin[3 ]; trans_rows = pg->num_block_rows * spgemm_block_size; trans_cols = pg->num_block_columns * spgemm_block_size; #ifdef LOG printf ("mtx_size = %d trans_rows= %d transcols=%d process(%d %d) \n" , mtx_size, trans_rows, trans_cols, myrow, mycol); #endif descinit_(desc_trans, &mtx_size, &mtx_size, &trans_rows, &trans_cols, &ZERO, &ZERO, &ictxt, &trans_rows, &info); double *local_buffer = (double *)calloc (trans_rows * trans_cols, sizeof (double )); int ia = 1 , ja = 1 , ib = 1 , jb = 1 ; pdgemr2d_(&mtx_size, &mtx_size, local_origin, &ia, &ja, desc_origin, local_buffer, &ib, &jb, desc_trans, &ictxt);
得到的数据先进行转置,从而将列主序调整至行主序:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 if (pg->num_block_rows==pg->num_block_columns){ for (int i=0 ;i<trans_rows;i++){ for (int j=0 ;j<i;j++){ std ::swap(local_buffer[i*trans_cols+ j], local_buffer[j*trans_rows + i]); } } }else { double *transpose_buffer = (double *)calloc (trans_rows * trans_cols, sizeof (double )); for (int i=0 ;i<trans_rows;i++){ for (int j=0 ;j<trans_cols;j++){ transpose_buffer[i*trans_cols + j] = local_buffer[j*trans_rows+i]; } } free (local_buffer); local_buffer = transpose_buffer; }
此时得到的是稠密的完整的DB-SpGEMM可处理的矩阵块,将其进行稀疏性检查,将含有非零元的块加入矩阵当中:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 for (int i=0 ;i<pg->num_block_rows;i++){ for (int j=0 ;j<pg->num_block_columns;j++){ int block_offset = i*pg->num_block_columns*spgemm_block_size*spgemm_block_size + j*spgemm_block_size; bool nnz = false ; for (int k=0 ;k<spgemm_block_size&&!nnz;k++){ for (int l=0 ;l<spgemm_block_size&&!nnz;l++){ if (local_buffer[block_offset + k*trans_cols + l] != 0 ){ nnz = true ; break ; } } } if (nnz){ distblockmat.local_data[i][j].resize(spgemm_block_size*spgemm_block_size); for (int k=0 ;k<spgemm_block_size;k++){ memcpy (distblockmat.local_data[i][j].data() + k*spgemm_block_size, local_buffer + block_offset + k*trans_cols, spgemm_block_size*sizeof (double )); } } } }