diff -uNr espresso-5.0.2/Modules/mp.f90 espresso-5.0.2-GPU/Modules/mp.f90
--- espresso-5.0.2/Modules/mp.f90	2013-05-31 14:19:32.000000000 +0100
+++ espresso-5.0.2-GPU/Modules/mp.f90	2013-05-31 14:24:52.054367809 +0100
@@ -159,6 +159,10 @@
         IF (ierr/=0) CALL mp_stop( 8006 )
 #  endif
 
+#if defined(__CUDA) || defined(__PHIGEMM )
+        CALL InitCudaEnv()   ! GPU builds only; last step of mp_start. Presumably sets up the CUDA runtime (TODO confirm)
+#endif
+
         RETURN
       END SUBROUTINE mp_start
 !
@@ -172,6 +176,10 @@
         ierr = 0
         taskid = 0
 
+#if defined(__CUDA) || defined(__PHIGEMM )
+        CALL CloseCudaEnv()   ! GPU builds only; presumably the counterpart of InitCudaEnv called in mp_start
+#endif
+
 #if defined __HPM
 
         !   terminate the IBM Harware performance monitor
diff -uNr espresso-5.0.2/PW/src/addusdens.f90 espresso-5.0.2-GPU/PW/src/addusdens.f90
--- espresso-5.0.2/PW/src/addusdens.f90	2013-05-31 14:19:32.000000000 +0100
+++ espresso-5.0.2-GPU/PW/src/addusdens.f90	2013-05-31 14:30:03.720305082 +0100
@@ -1,5 +1,5 @@
 !
-! Copyright (C) 2001-2006 Quantum ESPRESSO group
+! Copyright (C) 2001-2013 Quantum ESPRESSO group
 ! This file is distributed under the terms of the
 ! GNU General Public License. See the file `License'
 ! in the root directory of the present distribution,
@@ -24,7 +24,11 @@
   IF ( tqr ) THEN
      CALL addusdens_r(rho,.true.)
   ELSE
+#if defined(__CUDA) && !defined(__DISABLE_CUDA_ADDUSDENS)
+     CALL addusdens_g_gpu(rho)   ! CUDA builds: used instead of addusdens_g unless the offload is disabled
+#else
      CALL addusdens_g(rho)
+#endif
   END IF
   !
   RETURN
diff -uNr espresso-5.0.2/PW/src/cdiaghg.f90 espresso-5.0.2-GPU/PW/src/cdiaghg.f90
--- espresso-5.0.2/PW/src/cdiaghg.f90	2013-05-31 14:19:32.000000000 +0100
+++ espresso-5.0.2-GPU/PW/src/cdiaghg.f90	2013-05-31 14:29:38.584367860 +0100
@@ -1,5 +1,5 @@
 !
-! Copyright (C) 2001-2006 Quantum ESPRESSO group
+! Copyright (C) 2001-2013 Quantum ESPRESSO group
 ! This file is distributed under the terms of the
 ! GNU General Public License. See the file `License'
 ! in the root directory of the present distribution,
@@ -17,6 +17,33 @@
   ! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
   ! ... On output both matrix are unchanged
   !
+  USE kinds,            ONLY : DP
+  !
+  IMPLICIT NONE
+  !
+  ! ... thin dispatcher: declares the dummy arguments and forwards all of them unchanged
+  INTEGER, INTENT(IN) :: n, m, ldh
+  COMPLEX(DP), INTENT(INOUT) :: h(ldh,n), s(ldh,n)
+  REAL(DP), INTENT(OUT) :: e(n)
+  COMPLEX(DP), INTENT(OUT) :: v(ldh,m)
+  ! ... CUDA+MAGMA builds call cdiaghg_gpu; every other build calls the LAPACK routine cdiaghg_compute
+#if defined(__CUDA) && defined(__MAGMA)
+  CALL cdiaghg_gpu( n, m, h, s, ldh, e, v )
+#else
+  CALL cdiaghg_compute( n, m, h, s, ldh, e, v )
+#endif
+  !
+  RETURN
+  !
+END SUBROUTINE cdiaghg
+
+!----------------------------------------------------------------------------
+SUBROUTINE cdiaghg_compute( n, m, h, s, ldh, e, v )
+  !----------------------------------------------------------------------------
+  !
+  ! ... calculates eigenvalues and eigenvectors of the generalized problem
+  ! ... Hv=eSv, with H hermitean matrix, S overlap matrix.
+  ! ... On output both matrices are unchanged
+  !
   ! ... LAPACK version - uses both ZHEGV and ZHEGVX
   !
   USE kinds,            ONLY : DP
@@ -187,7 +214,7 @@
   !
   RETURN
   !
-END SUBROUTINE cdiaghg
+END SUBROUTINE cdiaghg_compute
 !
 !----------------------------------------------------------------------------
 SUBROUTINE pcdiaghg( n, h, s, ldh, e, v, desc )
diff -uNr espresso-5.0.2/PW/src/newd.f90 espresso-5.0.2-GPU/PW/src/newd.f90
--- espresso-5.0.2/PW/src/newd.f90	2013-05-31 14:19:32.000000000 +0100
+++ espresso-5.0.2-GPU/PW/src/newd.f90	2013-05-31 14:28:51.751391724 +0100
@@ -1,5 +1,5 @@
 !
-! Copyright (C) 2001-2010 Quantum ESPRESSO group
+! Copyright (C) 2001-2013 Quantum ESPRESSO group
 ! This file is distributed under the terms of the
 ! GNU General Public License. See the file `License'
 ! in the root directory of the present distribution,
@@ -10,12 +10,41 @@
 
 CONTAINS
 !---------------------------------------
+
 SUBROUTINE newq(vr,deeq,skip_vltot) 
   !
   !   This routine computes the integral of the perturbed potential with
   !   the Q function 
   !
   USE kinds,                ONLY : DP
+  USE fft_base,             ONLY : dfftp
+  USE ions_base,            ONLY : nat
+  USE lsda_mod,             ONLY : nspin
+  USE uspp_param,           ONLY : nhm
+  !
+  IMPLICIT NONE
+  !
+  ! Input: potential (vr); in/out: contribution to the integral (deeq); skip_vltot is passed through
+  REAL(kind=dp), intent(in)  :: vr(dfftp%nnr,nspin)
+  REAL(kind=dp), intent(inout) :: deeq( nhm, nhm, nat, nspin )
+  LOGICAL, intent(in) :: skip_vltot
+  ! Thin dispatcher: CUDA builds (unless the newd offload is disabled) call newq_compute_gpu, else newq_compute
+#if defined(__CUDA) && !defined(__DISABLE_CUDA_NEWD)
+  CALL newq_compute_gpu(vr,deeq,skip_vltot)
+#else
+  CALL newq_compute(vr,deeq,skip_vltot)
+#endif
+  !
+  RETURN
+
+END SUBROUTINE newq
+
+SUBROUTINE newq_compute(vr,deeq,skip_vltot)
+  !
+  !   This routine computes the integral of the perturbed potential with
+  !   the Q function (CPU implementation called by the newq dispatcher)
+  !
+  USE kinds,                ONLY : DP
   USE ions_base,            ONLY : nat, ntyp => nsp, ityp
   USE cell_base,            ONLY : omega
   USE fft_base,             ONLY : dfftp
@@ -176,7 +205,7 @@
   !
   DEALLOCATE( aux, qgm, qmod, ylmk0 )
   !
-END SUBROUTINE newq
+END SUBROUTINE newq_compute
 !---------------------------------------
 SUBROUTINE newd()
   USE uspp,          ONLY : deeq
diff -uNr espresso-5.0.2/PW/src/pwscf.f90 espresso-5.0.2-GPU/PW/src/pwscf.f90
--- espresso-5.0.2/PW/src/pwscf.f90	2013-05-31 14:19:32.000000000 +0100
+++ espresso-5.0.2-GPU/PW/src/pwscf.f90	2013-05-31 14:25:31.013375854 +0100
@@ -36,13 +36,13 @@
   !
   CHARACTER(len=256) :: dirname
   !
-#ifdef __MPI
-  !
+  ! mp_startup now runs in every build (serial too); only io_image_start below stays MPI-only
   CALL mp_startup ( )
   ! reset IO nodes
   ! (do this to make each "image head node" an ionode)
   ! Has to be used ONLY to run nimage copies of pwscf
   !
+#ifdef __MPI
   IF ( nimage > 1 ) CALL io_image_start( )
 #endif
   CALL environment_start ( 'PWSCF' )
diff -uNr espresso-5.0.2/PW/src/rdiaghg.f90 espresso-5.0.2-GPU/PW/src/rdiaghg.f90
--- espresso-5.0.2/PW/src/rdiaghg.f90	2013-05-31 14:19:32.000000000 +0100
+++ espresso-5.0.2-GPU/PW/src/rdiaghg.f90	2013-05-31 14:27:59.078331418 +0100
@@ -1,5 +1,5 @@
 !
-! Copyright (C) 2003-2006 Quantum ESPRESSO group
+! Copyright (C) 2003-2013 Quantum ESPRESSO group
 ! This file is distributed under the terms of the
 ! GNU General Public License. See the file `License'
 ! in the root directory of the present distribution,
@@ -14,6 +14,33 @@
   ! ... Hv=eSv, with H symmetric matrix, S overlap matrix.
   ! ... On output both matrix are unchanged
   !
+  USE kinds,            ONLY : DP
+  !
+  IMPLICIT NONE
+  !
+  ! ... thin dispatcher: declares the dummy arguments and forwards all of them unchanged
+  INTEGER, INTENT(IN) :: n, m, ldh
+  REAL(DP), INTENT(INOUT) :: h(ldh,n), s(ldh,n)
+  REAL(DP), INTENT(OUT) :: e(n)
+  REAL(DP), INTENT(OUT) :: v(ldh,m)
+  ! ... CUDA+MAGMA builds call rdiaghg_gpu; every other build calls the LAPACK routine rdiaghg_compute
+#if defined(__CUDA) && defined(__MAGMA)
+  CALL rdiaghg_gpu( n, m, h, s, ldh, e, v )
+#else
+  CALL rdiaghg_compute( n, m, h, s, ldh, e, v )
+#endif
+  !
+  RETURN
+  !
+END SUBROUTINE rdiaghg
+
+!----------------------------------------------------------------------------
+SUBROUTINE rdiaghg_compute( n, m, h, s, ldh, e, v )
+  !----------------------------------------------------------------------------
+  !
+  ! ... calculates eigenvalues and eigenvectors of the generalized problem
+  ! ... Hv=eSv, with H symmetric matrix, S overlap matrix.
+  ! ... On output both matrices are unchanged
+  !
   ! ... LAPACK version - uses both DSYGV and DSYGVX
   !
   USE kinds,            ONLY : DP
@@ -169,7 +196,7 @@
   !
   RETURN
   !
-END SUBROUTINE rdiaghg
+END SUBROUTINE rdiaghg_compute
 !
 !----------------------------------------------------------------------------
 SUBROUTINE prdiaghg( n, h, s, ldh, e, v, desc )
diff -uNr espresso-5.0.2/PW/src/vloc_psi.f90 espresso-5.0.2-GPU/PW/src/vloc_psi.f90
--- espresso-5.0.2/PW/src/vloc_psi.f90	2013-05-31 14:19:32.000000000 +0100
+++ espresso-5.0.2-GPU/PW/src/vloc_psi.f90	2013-05-31 14:27:27.759367967 +0100
@@ -1,5 +1,5 @@
 !
-! Copyright (C) 2003-2009 PWSCF group
+! Copyright (C) 2003-2013 PWSCF group
 ! This file is distributed under the terms of the
 ! GNU General Public License. See the file `License'
 ! in the root directory of the present distribution,
@@ -37,6 +37,10 @@
   COMPLEX(DP), ALLOCATABLE :: tg_psic(:)
   INTEGER :: v_siz, idx, ioff
   !
+#if defined(__CUDA) && !defined(__DISABLE_CUDA_VLOCPSI) && (!defined(__PARA) || defined(__USE_3D_FFT))
+  CALL vloc_psi_gamma_gpu ( lda, n, m, psi, v, hpsi )   ! GPU variant replaces the CPU body below (hence the RETURN)
+  RETURN
+#endif
   !
   incr = 2
   !
@@ -222,6 +226,10 @@
   COMPLEX(DP), ALLOCATABLE :: tg_psic(:)
   INTEGER :: v_siz, idx, ioff
   !
+#if defined(__CUDA) && !defined(__DISABLE_CUDA_VLOCPSI) && (!defined(__PARA) || defined(__USE_3D_FFT))
+  CALL vloc_psi_k_gpu ( lda, n, m, psi, v, hpsi )   ! GPU variant replaces the CPU body below (hence the RETURN)
+  RETURN
+#endif
   !
   ! The following is dirty trick to prevent usage of task groups if
   ! the number of bands is smaller than the number of task groups