/* ---------------------------------------------------------------------
 *
 * Copyright (C) 2009 - 2019 by the deal.II authors
 *
 * This file is part of the deal.II library.
 *
 * The deal.II library is free software; you can use it, redistribute
 * it, and/or modify it under the terms of the GNU Lesser General
 * Public License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * The full text of the license can be found in the file LICENSE.md at
 * the top level directory of deal.II.
 *
 * ---------------------------------------------------------------------
 *
 * Author: Wolfgang Bangerth, Texas A&M University, 2009, 2010
 *         Timo Heister, University of Goettingen, 2009, 2010
 */
#include <deal.II/base/quadrature_lib.h>
#include <deal.II/base/function.h>
#include <deal.II/base/timer.h>
#include <deal.II/lac/generic_linear_algebra.h>

#include <deal.II/base/time_stepping.h>

#include <deal.II/base/utilities.h>
#include <deal.II/base/conditional_ostream.h>
#include <deal.II/base/index_set.h>

namespace LA
{
#undef DEAL_II_WITH_PETSC
#if defined(DEAL_II_WITH_PETSC) && !defined(DEAL_II_PETSC_WITH_COMPLEX) && \
	!(defined(DEAL_II_WITH_TRILINOS) && defined(FORCE_USE_OF_TRILINOS))
	using namespace dealii::LinearAlgebraPETSc;
#  define USE_PETSC_LA
#elif defined(DEAL_II_WITH_TRILINOS)
	using namespace dealii::LinearAlgebraTrilinos;
#else
#  error DEAL_II_WITH_PETSC or DEAL_II_WITH_TRILINOS required
#endif
} // namespace LA
#include <deal.II/lac/vector.h>
#include <deal.II/lac/full_matrix.h>
#include <deal.II/lac/solver_cg.h>
#include <deal.II/lac/affine_constraints.h>
#include <deal.II/lac/dynamic_sparsity_pattern.h>
#include <deal.II/lac/sparsity_tools.h>
#include <deal.II/lac/linear_operator.h>
#include <deal.II/lac/parpack_solver.h>
#include <deal.II/lac/solver_gmres.h>
#include <deal.II/lac/sparse_direct.h>

#include <deal.II/grid/grid_generator.h>
#include <deal.II/grid/tria_accessor.h>
#include <deal.II/grid/tria_iterator.h>

#include <deal.II/dofs/dof_handler.h>
#include <deal.II/dofs/dof_accessor.h>
#include <deal.II/dofs/dof_tools.h>

#include <deal.II/fe/fe_values.h>
#include <deal.II/fe/fe_q.h>

#include <deal.II/numerics/vector_tools.h>
#include <deal.II/numerics/data_out.h>
#include <deal.II/numerics/error_estimator.h>

#include <deal.II/matrix_free/fe_evaluation.h>
#include <deal.II/matrix_free/matrix_free.h>
#include <deal.II/matrix_free/operators.h>

#include <deal.II/differentiation/ad.h>

#include <deal.II/distributed/tria.h>
#include <deal.II/distributed/grid_refinement.h>

#include <fstream>
#include <iostream>

constexpr double time_step = 0.1;

constexpr unsigned int fe_degree            = 2;
constexpr unsigned int n_q_points_1d        = fe_degree + 2;
using Number                                = double;

enum LowStorageRungeKuttaScheme
{
	stage_3_order_3, /* Kennedy, Carpenter, Lewis, 2000 */
	stage_5_order_4, /* Kennedy, Carpenter, Lewis, 2000 */
	stage_7_order_4, /* Tselios, Simos, 2007 */
	stage_9_order_5, /* Kennedy, Carpenter, Lewis, 2000 */
};
constexpr LowStorageRungeKuttaScheme lsrk_scheme = stage_5_order_4;

namespace Step40
{
	using namespace dealii;

	class LowStorageRungeKuttaIntegrator
	{
	public:
		LowStorageRungeKuttaIntegrator(const LowStorageRungeKuttaScheme scheme)
		{
			if (scheme == stage_3_order_3)
			{
				bi = {{0.245170287303492, 0.184896052186740, 0.569933660509768}};
				ai = {{0.755726351946097, 0.386954477304099}};
			}
			else if (scheme == stage_5_order_4)
			{
				bi = {{1153189308089. / 22510343858157.,
					   1772645290293. / 4653164025191.,
					   -1672844663538. / 4480602732383.,
					   2114624349019. / 3568978502595.,
					   5198255086312. / 14908931495163.}};
				ai = {{970286171893. / 4311952581923.,
					   6584761158862. / 12103376702013.,
					   2251764453980. / 15575788980749.,
					   26877169314380. / 34165994151039.}};
			}
			else if (scheme == stage_7_order_4)
			{
				bi = {{0.0941840925477795334,
					   0.149683694803496998,
					   0.285204742060440058,
					   -0.122201846148053668,
					   0.0605151571191401122,
					   0.345986987898399296,
					   0.186627171718797670}};
				ai = {{0.241566650129646868 + bi[0],
					   0.0423866513027719953 + bi[1],
					   0.215602732678803776 + bi[2],
					   0.232328007537583987 + bi[3],
					   0.256223412574146438 + bi[4],
					   0.0978694102142697230 + bi[5]}};
			}
			else if (scheme == stage_9_order_5)
			{
				bi = {{2274579626619. / 23610510767302.,
					   693987741272. / 12394497460941.,
					   -347131529483. / 15096185902911.,
					   1144057200723. / 32081666971178.,
					   1562491064753. / 11797114684756.,
					   13113619727965. / 44346030145118.,
					   393957816125. / 7825732611452.,
					   720647959663. / 6565743875477.,
					   3559252274877. / 14424734981077.}};
				ai = {{1107026461565. / 5417078080134.,
					   38141181049399. / 41724347789894.,
					   493273079041. / 11940823631197.,
					   1851571280403. / 6147804934346.,
					   11782306865191. / 62590030070788.,
					   9452544825720. / 13648368537481.,
					   4435885630781. / 26285702406235.,
					   2357909744247. / 11371140753790.}};
			}
			else
				AssertThrow(false, ExcNotImplemented());
		}

		unsigned int n_stages() const
		{
			return bi.size();
		}

		template <typename VectorType, typename Operator>
		void perform_time_step(const Operator &pde_operator,
							   const double    current_time,
							   const double    time_step,
							   VectorType &    solution,
							   VectorType &    vec_Ti,
							   VectorType &    vec_Ki)
		{
			AssertDimension(ai.size() + 1, bi.size());
			pde_operator.perform_stage(current_time,
									   bi[0] * time_step,
					ai[0] * time_step,
					solution,
					vec_Ti,
					solution,
					vec_Ti);
			double sum_previous_bi = 0;
			for (unsigned int stage = 1; stage < bi.size(); ++stage)
			{
				const double c_i = sum_previous_bi + ai[stage - 1];
				pde_operator.perform_stage(current_time + c_i * time_step,
										   bi[stage] * time_step,
										   (stage == bi.size() - 1 ?
												0 :
												ai[stage] * time_step),
										   vec_Ti,
										   vec_Ki,
										   solution,
										   vec_Ti);
				sum_previous_bi += bi[stage - 1];
			}
		}

	private:
		std::vector<double> bi;
		std::vector<double> ai;
	};

	template <int dim, int degree, int n_points_1d>
	class LaplaceOperator
	{
	public:
		static constexpr unsigned int n_quadrature_points_1d = n_points_1d;

		LaplaceOperator() : computing_times(6){

		};

		void reinit(const Mapping<dim> &   mapping,
					const DoFHandler<dim> &dof_handler,
					const AffineConstraints<double> &hanging_node_constraints);

		void print_computing_times() const;

		void apply(const double                                      current_time,
				   const LinearAlgebra::distributed::Vector<Number> &src,
				   LinearAlgebra::distributed::Vector<Number> &      dst) const;

		void
		perform_stage(const Number cur_time,
					  const Number factor_solution,
					  const Number factor_ai,
					  const LinearAlgebra::distributed::Vector<Number> &current_Ti,
					  LinearAlgebra::distributed::Vector<Number> &      vec_Ki,
					  LinearAlgebra::distributed::Vector<Number> &      solution,
					  LinearAlgebra::distributed::Vector<Number> &next_Ti) const;

		void project(const Function<dim> &                       function,
					 LinearAlgebra::distributed::Vector<Number> &solution) const;

		void
		initialize_vector(LinearAlgebra::distributed::Vector<Number> &vector) const;

	private:
		MatrixFree<dim, Number>     data;
		mutable std::vector<double> computing_times;

		LinearAlgebra::distributed::Vector<double> inv_mass_matrix;

		void local_apply_inverse_mass_matrix(
				const MatrixFree<dim, Number> &                   data,
				LinearAlgebra::distributed::Vector<Number> &      dst,
				const LinearAlgebra::distributed::Vector<Number> &src,
				const std::pair<unsigned int, unsigned int> &     cell_range) const;

		void local_apply_cell(
				const MatrixFree<dim, Number> &                   data,
				LinearAlgebra::distributed::Vector<Number> &      dst,
				const LinearAlgebra::distributed::Vector<Number> &src,
				const std::pair<unsigned int, unsigned int> &     cell_range) const;
	};

	template <int dim, int degree, int n_points_1d>
	void LaplaceOperator<dim, degree, n_points_1d>::reinit(
			const Mapping<dim> &mapping,
			const DoFHandler<dim> &dof_handler,
			const AffineConstraints<double> &hanging_node_constraints){
		std::vector<const DoFHandler<dim> *>           dof_handlers({&dof_handler});
		AffineConstraints<double>                      dummy(hanging_node_constraints);
		std::vector<const AffineConstraints<double> *> constraints({&hanging_node_constraints});
		std::vector<Quadrature<1>>                     quadratures(
		{QGauss<1>(n_q_points_1d),
		 QGauss<1>(fe_degree + 1)});
		typename MatrixFree<dim, Number>::AdditionalData additional_data;
		additional_data.mapping_update_flags =
				(update_gradients | update_JxW_values | update_quadrature_points |
				 update_values);

		additional_data.tasks_parallel_scheme =
				MatrixFree<dim, Number>::AdditionalData::none;

		data.reinit(mapping,
					dof_handlers,
					constraints,
					quadratures,
					additional_data);

		data.initialize_dof_vector(inv_mass_matrix);
		FEEvaluation<dim, degree, n_points_1d, 1, Number> fe_eval(data);
		const unsigned int           n_q_points = fe_eval.n_q_points;
		for (unsigned int cell = 0; cell < data.n_cell_batches(); ++cell)
		{
			fe_eval.reinit(cell);
			for (unsigned int q = 0; q < n_q_points; ++q)
				fe_eval.submit_value(make_vectorized_array(1.), q);
			fe_eval.integrate(true, false);
			fe_eval.distribute_local_to_global(inv_mass_matrix);
		}
		inv_mass_matrix.compress(VectorOperation::add);
		for (unsigned int k = 0; k < inv_mass_matrix.local_size(); ++k)
			if (inv_mass_matrix.local_element(k) > 1e-15)
				inv_mass_matrix.local_element(k) =
						1. / inv_mass_matrix.local_element(k);
			else
				inv_mass_matrix.local_element(k) = 1;
	}

	template <int dim, int degree, int n_points_1d>
	void LaplaceOperator<dim, degree, n_points_1d>::initialize_vector(
			LinearAlgebra::distributed::Vector<Number> &vector) const
	{
		data.initialize_dof_vector(vector);
	}

	template <int dim, int degree, int n_points_1d>
	void LaplaceOperator<dim, degree, n_points_1d>::print_computing_times() const
	{
		ConditionalOStream        pcout(std::cout,
										Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) ==
										0);
		Utilities::MPI::MinMaxAvg data;
		if (computing_times[2] + computing_times[3] > 0)
		{
			std::ios_base::fmtflags flags(std::cout.flags());
			std::cout << std::setprecision(3);
			pcout << "   Apply called "
				  << static_cast<std::size_t>(computing_times[3]) << " times"
																  << std::endl;
			pcout << "   RK update called "
				  << static_cast<std::size_t>(computing_times[2]) << " times"
																  << std::endl;
			pcout << "   Evaluate L_h:            ";
			data = Utilities::MPI::min_max_avg(computing_times[0], MPI_COMM_WORLD);
			pcout << std::scientific << std::setw(10) << data.min << " (p"
				  << std::setw(4) << data.min_index << ") " << std::setw(10)
				  << data.avg << std::setw(10) << data.max << " (p" << std::setw(4)
				  << data.max_index << ")" << std::endl;
			pcout << "   Inverse mass (+RK upd):  ";
			data = Utilities::MPI::min_max_avg(computing_times[1], MPI_COMM_WORLD);
			pcout << std::scientific << std::setw(10) << data.min << " (p"
				  << std::setw(4) << data.min_index << ") " << std::setw(10)
				  << data.avg << std::setw(10) << data.max << " (p" << std::setw(4)
				  << data.max_index << ")" << std::endl;
			pcout << "   Compute transport speed: ";
			data = Utilities::MPI::min_max_avg(computing_times[4], MPI_COMM_WORLD);
			pcout << std::scientific << std::setw(10) << data.min << " (p"
				  << std::setw(4) << data.min_index << ") " << std::setw(10)
				  << data.avg << std::setw(10) << data.max << " (p" << std::setw(4)
				  << data.max_index << ")" << std::endl;
			pcout << "   Compute errors/norms:    ";
			data = Utilities::MPI::min_max_avg(computing_times[5], MPI_COMM_WORLD);
			pcout << std::scientific << std::setw(10) << data.min << " (p"
				  << std::setw(4) << data.min_index << ") " << std::setw(10)
				  << data.avg << std::setw(10) << data.max << " (p" << std::setw(4)
				  << data.max_index << ")" << std::endl;
			std::cout.flags(flags);
		}
	}

	template <int dim, int degree, int n_points_1d>
	void LaplaceOperator<dim, degree, n_points_1d>::local_apply_cell(
			const MatrixFree<dim, Number> &                   data,
			LinearAlgebra::distributed::Vector<Number> &      dst,
			const LinearAlgebra::distributed::Vector<Number> &src,
			const std::pair<unsigned int, unsigned int> &     cell_range) const
	{
		FEEvaluation<dim, degree, n_points_1d, 1, Number> phi(data);

		for (unsigned int cell = cell_range.first; cell < cell_range.second; ++cell)
		{
			phi.reinit(cell);
			phi.gather_evaluate(src, false, true);
			for (unsigned int q = 0; q < phi.n_q_points; ++q)
			{
				phi.submit_gradient(-1. * phi.get_gradient(q), q);
			}
			phi.integrate_scatter(false, true, dst);
		}
	}

	template <int dim, int degree, int n_points_1d>
	void LaplaceOperator<dim, degree, n_points_1d>::local_apply_inverse_mass_matrix(
			const MatrixFree<dim, Number> &                   data,
			LinearAlgebra::distributed::Vector<Number> &      dst,
			const LinearAlgebra::distributed::Vector<Number> &src,
			const std::pair<unsigned int, unsigned int> &     cell_range) const
	{
		dst.reinit(src);
		dst = src;
		dst.scale(inv_mass_matrix);
	}

	template <int dim, int degree, int n_points_1d>
	void LaplaceOperator<dim, degree, n_points_1d>::apply(
			const double                                      current_time,
			const LinearAlgebra::distributed::Vector<Number> &src,
			LinearAlgebra::distributed::Vector<Number> &      dst) const
	{
		Timer timer;

		data.cell_loop(&LaplaceOperator::local_apply_cell,
					   this,
					   dst,
					   src,
					   true);
		computing_times[0] += timer.wall_time();
		timer.restart();

		data.cell_loop(&LaplaceOperator::local_apply_inverse_mass_matrix,
					   this,
					   dst,
					   dst);
		computing_times[1] += timer.wall_time();

		computing_times[3] += 1.;
	}

	template <int dim, int degree, int n_points_1d>
	void LaplaceOperator<dim, degree, n_points_1d>::perform_stage(
			const Number                                      current_time,
			const Number                                      factor_solution,
			const Number                                      factor_ai,
			const LinearAlgebra::distributed::Vector<Number> &current_Ti,
			LinearAlgebra::distributed::Vector<Number> &      vec_Ki,
			LinearAlgebra::distributed::Vector<Number> &      solution,
			LinearAlgebra::distributed::Vector<Number> &      next_Ti) const
	{
		Timer timer;

		data.cell_loop(&LaplaceOperator::local_apply_cell,
					   this,
					   vec_Ki,
					   current_Ti,
					   true);
		computing_times[0] += timer.wall_time();

		timer.restart();

		data.cell_loop(&LaplaceOperator::local_apply_inverse_mass_matrix,
					   this,
					   next_Ti,
					   vec_Ki,
					   std::function<void(const unsigned int, const unsigned int)>(),
					   [&](const unsigned int start_range, const unsigned int end_range) {
			const Number ai = factor_ai;
			const Number bi = factor_solution;
			if (ai == Number())
			{
				DEAL_II_OPENMP_SIMD_PRAGMA
						for (unsigned int i = start_range; i < end_range; ++i)
				{
					const Number K_i          = next_Ti.local_element(i);
					const Number sol_i        = solution.local_element(i);
					solution.local_element(i) = sol_i + bi * K_i;
				}
			}
			else
			{
				DEAL_II_OPENMP_SIMD_PRAGMA
						for (unsigned int i = start_range; i < end_range; ++i)
				{
					const Number K_i          = next_Ti.local_element(i);
					const Number sol_i        = solution.local_element(i);
					solution.local_element(i) = sol_i + bi * K_i;
					next_Ti.local_element(i)  = sol_i + ai * K_i;
				}
			}
		});

		computing_times[1] += timer.wall_time();

		computing_times[2] += 1.;
	}


	template <int dim>
	class InitialCondition : public Function<dim>
	{
	public:
		InitialCondition() : Function<dim>(1){

		}
		virtual double value(const Point<dim> &p, const unsigned int component) const override;
		virtual Tensor<1, dim> gradient(const Point<dim> &p, const unsigned int component) const override;
	};

	template <int dim>
	double InitialCondition<dim>::value(const Point<dim> &p, const unsigned int) const{
		const double x = p[0];
		const double y = p[1];

		return sin(M_PI * x) * sin(M_PI * y);
	}

	template <int dim>
	Tensor<1, dim> InitialCondition<dim>::gradient(const Point<dim> &p, const unsigned int) const{
		const double x = p[0];
		const double y = p[1];

		Tensor<1, dim> return_value;
		return_value[0] = M_PI * cos(M_PI * x) * sin(M_PI * y);
		return_value[1] = M_PI * sin(M_PI * x) * cos(M_PI * y);

		return return_value;
	}

	template <int dim>
	class LaplaceProblem
	{
	public:
		LaplaceProblem();
		void run_with_MF();
	private:
		void setup_system();

		void output_results(const unsigned int cycle) const;

		MPI_Comm mpi_communicator;
		parallel::distributed::Triangulation<dim> triangulation;
		FE_Q<dim>       fe;
		MappingQGeneric<dim> mapping;
		DoFHandler<dim> dof_handler;
		AffineConstraints<double> constraints;
		LaplaceOperator<dim, fe_degree, n_q_points_1d> laplace_operator;
		LinearAlgebra::distributed::Vector<Number> locally_relevant_solution_MF;

		ConditionalOStream pcout;
		TimerOutput        computing_timer;
	};

	template <int dim>
	LaplaceProblem<dim>::LaplaceProblem()
		: mpi_communicator(MPI_COMM_WORLD)
		, triangulation(mpi_communicator,
						typename Triangulation<dim>::MeshSmoothing(
							Triangulation<dim>::smoothing_on_refinement |
							Triangulation<dim>::smoothing_on_coarsening))
		, fe(fe_degree)
		, mapping(fe.degree + 1)
		, dof_handler(triangulation)
		, pcout(std::cout,
				(Utilities::MPI::this_mpi_process(mpi_communicator) == 0))
		, computing_timer(mpi_communicator,
						  pcout,
						  TimerOutput::summary,
						  TimerOutput::wall_times)
	{}

	template <int dim>
	void LaplaceProblem<dim>::setup_system()
	{
		TimerOutput::Scope t(computing_timer, "setup");
		dof_handler.distribute_dofs(fe);

		IndexSet locally_relevant_dofs;
		DoFTools::extract_locally_relevant_dofs(dof_handler, locally_relevant_dofs);

		constraints.clear();
		constraints.reinit(locally_relevant_dofs);
		DoFTools::make_hanging_node_constraints(dof_handler, constraints);
		VectorTools::interpolate_boundary_values(dof_handler,
												 0,
												 Functions::ZeroFunction<dim>(),
												 constraints);
		constraints.close();

		laplace_operator.reinit(mapping, dof_handler, constraints);
		laplace_operator.initialize_vector(locally_relevant_solution_MF);
	}

	template <int dim>
	void LaplaceProblem<dim>::output_results(const unsigned int cycle) const
	{
		DataOut<dim> data_out;
		data_out.attach_dof_handler(dof_handler);
		data_out.add_data_vector(locally_relevant_solution_MF, "u_MF");
		Vector<float> subdomain(triangulation.n_active_cells());
		for (unsigned int i = 0; i < subdomain.size(); ++i)
			subdomain(i) = triangulation.locally_owned_subdomain();
		data_out.add_data_vector(subdomain, "subdomain");
		data_out.build_patches();

		data_out.write_vtu_with_pvtu_record("", "solution", cycle);
	}

	template <int dim>
	void LaplaceProblem<dim>::run_with_MF(){
		{
			const unsigned int n_vect_number =
					VectorizedArray<Number>::n_array_elements;
			const unsigned int n_vect_bits = 8 * sizeof(Number) * n_vect_number;

			pcout << "Running with "
				  << Utilities::MPI::n_mpi_processes(MPI_COMM_WORLD)
				  << " MPI processes" << std::endl;
			pcout << "Vectorization over " << n_vect_number << " "
				  << (std::is_same<Number, double>::value ? "doubles" : "floats")
				  << " = " << n_vect_bits << " bits ("
				  << Utilities::System::get_current_vectorization_level()
				  << "), VECTORIZATION_LEVEL=" << DEAL_II_COMPILER_VECTORIZATION_LEVEL
				  << std::endl;
		}

		const unsigned int n_cycles = 8;

		double local_time = 0., local_end_time = 0.;

		GridGenerator::hyper_cube(triangulation);
		triangulation.refine_global(5);

		setup_system();

		for (unsigned int cycle = 0; cycle < n_cycles; ++cycle)
		{
			local_time = cycle * time_step;
			local_end_time = (cycle + 1) * time_step;
			pcout << "Cycle " << cycle << ':' << std::endl;

			//triangulation.refine_global(1);


			if(cycle == 0){

				LinearAlgebra::distributed::Vector<Number> local_solution;
				laplace_operator.initialize_vector(local_solution);

				VectorTools::project(dof_handler,
									 constraints,
									 QGauss<dim>(fe.degree + 1),
									 InitialCondition<dim>(),
									 local_solution);

				locally_relevant_solution_MF = local_solution;
			}

			LowStorageRungeKuttaIntegrator integrator(lsrk_scheme);

			LinearAlgebra::distributed::Vector<Number> rk_register_1;
			LinearAlgebra::distributed::Vector<Number> rk_register_2;
			rk_register_1.reinit(locally_relevant_solution_MF);
			rk_register_2.reinit(locally_relevant_solution_MF);

			double local_time_step = 0.01;

			//			if (Utilities::MPI::n_mpi_processes(mpi_communicator) <= 32 && cycle == 0)
			//			{
			//				TimerOutput::Scope t(computing_timer, "output");
			//				output_results(cycle);
			//			}

			pcout << "   Number of active cells:       "
				  << triangulation.n_global_active_cells() << std::endl
				  << "   Number of degrees of freedom: " << dof_handler.n_dofs()
				  << std::endl;

			Timer        timer;

			double       wtime           = 0;
			double       output_time     = 0;
			unsigned int timestep_number = 1;

			while (local_time < (local_end_time - 1e-12))
			{
				timer.restart();
				++timestep_number;

				integrator.perform_time_step(laplace_operator,
											 local_time,
											 local_time_step,
											 locally_relevant_solution_MF,
											 rk_register_1,
											 rk_register_2);

				constraints.distribute(locally_relevant_solution_MF);

				local_time += local_time_step;

				wtime += timer.wall_time();

				timer.restart();

				//				if (static_cast<int>(time / output_tick) !=
				//						static_cast<int>((time - local_time_step) / output_tick) ||
				//						time >= time_step - 1e-12)
				//					output_results(
				//								static_cast<unsigned int>(std::round(time / output_tick)));
				output_time += timer.wall_time();
			}

			pcout << "Number of time steps: " << timestep_number << '\n';
			timestep_number = 1;
			if (Utilities::MPI::n_mpi_processes(mpi_communicator) <= 32)
			{
				TimerOutput::Scope t(computing_timer, "output");
				output_results(cycle + 1);
			}
			computing_timer.print_summary();
			computing_timer.reset();
			pcout << std::endl;
		}

		local_time = 0.;
	}
} // namespace Step40
int main(int argc, char *argv[])
{
	std::cout << "Hello World\n";
	try
	{
		using namespace dealii;
		using namespace Step40;
#ifdef DEAL_II_WITH_PETSC
		Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv, 1);
#else
		Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv, dealii::numbers::invalid_unsigned_int);
#endif
		LaplaceProblem<2> laplace_problem_2d;
		//laplace_problem_2d.run();
		laplace_problem_2d.run_with_MF();
	}
	catch (std::exception &exc)
	{
		std::cerr << std::endl
				  << std::endl
				  << "----------------------------------------------------"
				  << std::endl;
		std::cerr << "Exception on processing: " << std::endl
				  << exc.what() << std::endl
				  << "Aborting!" << std::endl
				  << "----------------------------------------------------"
				  << std::endl;
		return 1;
	}
	catch (...)
	{
		std::cerr << std::endl
				  << std::endl
				  << "----------------------------------------------------"
				  << std::endl;
		std::cerr << "Unknown exception!" << std::endl
				  << "Aborting!" << std::endl
				  << "----------------------------------------------------"
				  << std::endl;
		return 1;
	}
	return 0;
}
