Lis is a parallel library of iterative solvers for linear systems. Instead of reading the matrix from a file, it is sometimes useful to write the matrix entries directly in the code. I tested this approach and give an example here. The matrix is stored in CSR format and distributed across all the CPUs.

#include <stdio.h>
#include "lis_config.h"
#include "lis.h"

/*
 * Builds a 6x6 sparse matrix A directly in code, solves A x = b with Lis and
 * writes b and x (Matrix Market format) to the files named on the command line.
 *
 * Usage: test <rhs_file> <solution_file>
 *
 * Every process inserts only those entries whose global row index falls inside
 * its own [is, ie) range as reported by lis_matrix_get_range().
 *
 * Returns 0 on success and 1 when the two output file names are missing.
 */
LIS_INT main(LIS_INT argc, char* argv[])
{
  /* Nonzero entries of A as (global row, global column, value). */
  static const struct {
    LIS_INT    row;
    LIS_INT    col;
    LIS_SCALAR value;
  } entries[] = {
    {1, 2, -1.0},
    {0, 1, 4.11418428571},
    {5, 4, -1.0},
    {0, 0, 25.6},
    {3, 3, -0.000345454479536},
    {3, 0, -1.0},
    {3, 4, 0.000345454479536},
    {4, 4, -0.000345454479536},
    {2, 1, -1.0},
    {2, 0, -1.0},
    {1, 4, -1.0},
    {4, 3, 0.000345454479536},
    {1, 0, 4.11418428571},
    {4, 5, -1.0},
    {0, 3, -1.0},
    {5, 3, -1.0},
    {4, 1, -1.0},
    {0, 2, -1.0},
    {3, 5, -1.0},
  };
  const size_t num_entries = sizeof entries / sizeof entries[0];

  LIS_MATRIX  A;
  LIS_VECTOR  b, x;
  LIS_SOLVER  solver;
  LIS_INT     iter;
  double      elapsed;
  LIS_INT     n, gn, is, ie;
  size_t      k;

  lis_initialize(&argc, &argv);

  /* Both output file names are mandatory; bail out before touching argv[1..2]. */
  if (argc < 3) {
    fprintf(stderr, "usage: %s <rhs_file> <solution_file>\n",
            argc > 0 ? argv[0] : "test");
    lis_finalize();
    return 1;
  }

  /* Global size is fixed; a local size of 0 leaves the row distribution to Lis. */
  lis_matrix_create(LIS_COMM_WORLD, &A);
  gn = 6;
  lis_matrix_set_size(A, 0, gn);
  lis_matrix_get_size(A, &n, &gn);
  lis_matrix_get_range(A, &is, &ie);

  /* Insert only the rows in this process's [is, ie) range. */
  for (k = 0; k < num_entries; k++) {
    if (entries[k].row >= is && entries[k].row < ie) {
      lis_matrix_set_value(LIS_INS_VALUE, entries[k].row, entries[k].col,
                           entries[k].value, A);
    }
  }
  lis_matrix_assemble(A);

  /*
   * lis_vector_duplicate() creates the vector itself, so no prior
   * lis_vector_create() call is made (its handle would be overwritten
   * and leaked).
   */
  lis_vector_duplicate(A, &b);
  lis_vector_set_all(0.0, b);
  lis_vector_get_range(b, &is, &ie);
  /* Only the process that holds global row 0 sets the single nonzero of b. */
  if (is <= 0 && 0 < ie) {
    lis_vector_set_value(LIS_INS_VALUE, 0, 5.04, b);
  }
  lis_output_vector(b, LIS_FMT_MM, argv[1]);

  lis_vector_duplicate(A, &x);

  lis_solver_create(&solver);
  /* Solver options are taken from the command line. */
  lis_solver_set_optionC(solver);
  lis_solve(A, b, x, solver);
  lis_solver_get_iter(solver, &iter);
  lis_solver_get_time(solver, &elapsed);
  /* LIS_INT may be wider than int, so convert explicitly for printf. */
  printf("number of iterations = %lld\n", (long long)iter);
  printf("elapsed time = %e\n", elapsed);

  lis_output_vector(x, LIS_FMT_MM, argv[2]);

  lis_solver_destroy(solver);
  lis_matrix_destroy(A);
  lis_vector_destroy(b);
  lis_vector_destroy(x);

  lis_finalize();

  return 0;
}

And here is the Makefile.

# Builds the Lis example program with the MPI compiler wrapper.
# NOTE: recipe lines below must start with a TAB character, not spaces.
CC=mpicc
#CFLAGS=-c -Wall
#LDFLAGS=
#CFLAGS=-c -Wall -I/nas02/home/y/i/yiy/software/lis/lis-1.4.61-build/include
# Compile-only flags (-c is included here, so CFLAGS is used for the .c -> .o step only).
CFLAGS=-DHAVE_CONFIG_H -I. -I/nas02/home/y/i/yiy/software/lis/lis-1.4.61/include  -I/nas02/home/y/i/yiy/software/lis/lis-1.4.61/   -O3 -ansi_alias  -DUSE_MPI -c
#LDFLAGS=-L/nas02/home/y/i/yiy/software/lis/lis-1.4.61-build/lib -llis
# Link against the static Lis library from the source tree.
LDFLAGS=-L/nas02/home/y/i/yiy/software/lis/lis-1.4.61/src /nas02/home/y/i/yiy/software/lis/lis-1.4.61/src/.libs/liblis.a -lm
SOURCES=test_wiki.c
OBJECTS=$(SOURCES:.c=.o)
EXECUTABLE=test

# all and clean do not name files, so they must always run.
.PHONY: all clean

all: $(EXECUTABLE)

$(EXECUTABLE): $(OBJECTS)
	$(CC) $(OBJECTS) $(LDFLAGS)  -o $@

.c.o:
	$(CC) $(CFLAGS) $< -o $@

# Remove only what this Makefile builds (the old "*o" glob matched any file ending in "o").
clean:
	rm -f $(OBJECTS) $(EXECUTABLE)

I can run the executable both in serial and in parallel.

# Serial run: the first argument ("b") is the file the right-hand side is written to,
# the second ("x") is the file the solution is written to.
./test b x
# The same program launched on 2 MPI processes.
mpiexec -n 2 ./test b x

However, when the number of CPUs is larger than the number of rows in the matrix A, the program hangs. I am contacting the Lis developers to see whether there is something wrong with the code.