commit f848d72f06b82197af13609dc5a16c50fadabafb Author: RonaldsonBellande Date: Fri Jan 31 01:29:14 2025 -0500 foundation diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..9fd45e0 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,22 @@ +name: Rust + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Build + run: cargo build --verbose + - name: Run tests + run: cargo test --verbose diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a9d37c5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..f697a44 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,40 @@ +[package] +name = "bellande_artificial_intelligence_training_framework" +version = "0.0.1" +authors = ["Ronaldson Bellande"] +edition = "2021" +description = "An AI Training Framework developed by Bellande AI Research" +license = "GPL-3.0-or-later" +repository = "https://github.com/Artificial-Intelligence-Computer-Vision/bellande_artificial_intelligence_training_framework" + +[lib] +name = "bellande_artificial_intelligence_training_framework" +path = "src/bellande_artificial_intelligence_training_framework.rs" + +[dependencies] +# Core dependencies +ndarray = "0.15" +rand = "0.8" +rand_distr = "0.4" + +# Serialization +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +serde_yaml = "0.9" + +# Utilities +num_cpus = "1.13" +rayon = "1.5" +parking_lot = "0.12" + +# Visualization +plotters = "0.3" +imageproc = "0.23" + +# Optional CUDA support +cudarc = { version = "0.9", optional = true } + +glob = "0.3.1" +bincode = "1.3.3" + +sys-info = "0.9" + +# The sources gate CUDA code behind a "cuda" feature, so it must be declared. +[features] +cuda = ["cudarc"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f288702 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others.
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. 
+ + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. 
You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. 
In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>.
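Before the source files, a short orientation: the diffs below add the README and build scripts, the crate entry point (`Framework`), and a `core` module with device, dtype, error, random, and tensor primitives. The sketch that follows shows how those entry points are meant to fit together. It is illustrative only: it assumes the crate builds as laid out in `Cargo.toml` above, and it uses only APIs visible in this commit (`Framework::new`/`initialize`/`system_info` and `Device`'s `FromStr` implementation).

```rust
use std::str::FromStr;

use bellande_artificial_intelligence_training_framework::core::device::Device;
use bellande_artificial_intelligence_training_framework::Framework;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Default construction targets the CPU with the default configuration.
    let mut framework = Framework::new()?;
    framework.initialize()?;

    // Device strings follow the convention in `Device::from_str`:
    // "cpu", "cuda" (shorthand for device 0), or "cuda:<index>".
    let cpu = Device::from_str("cpu")?;
    assert!(cpu.is_cpu());

    // Without the `cuda` feature, `cuda_device_count()` is 0, so any CUDA
    // device string is rejected with `BellandeError::DeviceNotAvailable`.
    match Device::from_str("cuda:0") {
        Ok(device) => println!("using {}", device),
        Err(err) => eprintln!("CUDA unavailable: {}", err),
    }

    println!("{}", Framework::system_info());
    Ok(())
}
```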
diff --git a/README.md b/README.md new file mode 100644 index 0000000..819ab03 --- /dev/null +++ b/README.md @@ -0,0 +1,45 @@ +# Bellande Artificial Intelligence Training Framework + +Bellande training framework in Rust for machine learning models + +# Run Bellos Scripts + - build_bellande_framework.bellos + - make_rust_executable.bellos + - run_bellande_framework.bellos + +# Run Bash Scripts + - build_bellande_framework.sh + - make_rust_executable.sh + - run_bellande_framework.sh + +# Testing +- Run `cargo test` for a quick test + +## Example Usage +```rust +use bellande_artificial_intelligence_training_framework::prelude::*; + +fn main() -> Result<(), Box<dyn std::error::Error>> { + let mut framework = Framework::new()?; + framework.initialize()?; + + // Create model + let model = Sequential::new() + .add(Conv2d::new(3, 64, 3, 1, 1)) + .add(ReLU::new()) + .add(Linear::new(64, 10)); + + // Configure training + let optimizer = Adam::new(model.parameters(), 0.001); + let loss_fn = CrossEntropyLoss::new(); + let trainer = Trainer::new(model, optimizer, loss_fn); + + // Train model + trainer.fit(train_loader, Some(val_loader), 100)?; + + Ok(()) +} +``` + +## License +Bellande Artificial Intelligence Training Framework is distributed under the [GNU General Public License v3.0](https://www.gnu.org/licenses/gpl-3.0.en.html), see [LICENSE](https://github.com/Artificial-Intelligence-Computer-Vision/bellande_artificial_intelligence_training_framework/blob/main/LICENSE) and [NOTICE](https://github.com/Artificial-Intelligence-Computer-Vision/bellande_artificial_intelligence_training_framework/blob/main/LICENSE) for more information. diff --git a/dependencies.bellande b/dependencies.bellande new file mode 100644 index 0000000..e69de29 diff --git a/git_scripts/.gitignore b/git_scripts/.gitignore new file mode 100644 index 0000000..e5a7a9c --- /dev/null +++ b/git_scripts/.gitignore @@ -0,0 +1,3 @@ +fix_errors.sh +push.sh +repository_recal.sh diff --git a/scripts/bash/build_bellande_framework.sh b/scripts/bash/build_bellande_framework.sh new file mode 100755 index 0000000..8ebfb62 --- /dev/null +++ b/scripts/bash/build_bellande_framework.sh @@ -0,0 +1 @@ +cargo build diff --git a/scripts/bash/make_rust_executable.sh b/scripts/bash/make_rust_executable.sh new file mode 100755 index 0000000..e327563 --- /dev/null +++ b/scripts/bash/make_rust_executable.sh @@ -0,0 +1 @@ +bellande_rust_executable -d ../../dependencies.bellande -s ../../src -m bellande_artificial_intelligence_training_framework.rs -o ../../executable/bellande_artificial_intelligence_training_framework diff --git a/scripts/bash/run_bellande_framework.sh b/scripts/bash/run_bellande_framework.sh new file mode 100755 index 0000000..535db6c --- /dev/null +++ b/scripts/bash/run_bellande_framework.sh @@ -0,0 +1 @@ +bellande_rust_executable -d ../dependencies.bellande -s ../src -m bellande_artificial_intelligence_training_framework.rs -o ../executable/bellande_artificial_intelligence_training_framework diff --git a/scripts/bellos/build_bellande_framework.bellos b/scripts/bellos/build_bellande_framework.bellos new file mode 100755 index 0000000..8ebfb62 --- /dev/null +++ b/scripts/bellos/build_bellande_framework.bellos @@ -0,0 +1 @@ +cargo build diff --git a/scripts/bellos/make_rust_executable.bellos b/scripts/bellos/make_rust_executable.bellos new file mode 100755 index 0000000..e327563 --- /dev/null +++ b/scripts/bellos/make_rust_executable.bellos @@ -0,0 +1 @@ +bellande_rust_executable -d ../../dependencies.bellande -s ../../src -m bellande_artificial_intelligence_training_framework.rs -o
../../executable/bellande_artificial_intelligence_training_framework diff --git a/scripts/bellos/run_bellande_framework.bellos b/scripts/bellos/run_bellande_framework.bellos new file mode 100755 index 0000000..535db6c --- /dev/null +++ b/scripts/bellos/run_bellande_framework.bellos @@ -0,0 +1 @@ +bellande_rust_executable -d ../dependencies.bellande -s ../src -m bellande_artificial_intelligence_training_framework.rs -o ../executable/bellande_artificial_intelligence_training_framework diff --git a/src/bellande_artificial_intelligence_training_framework.rs b/src/bellande_artificial_intelligence_training_framework.rs new file mode 100644 index 0000000..1bab39f --- /dev/null +++ b/src/bellande_artificial_intelligence_training_framework.rs @@ -0,0 +1,113 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. + +use crate::core::{device::Device, error::BellandeError}; +use std::error::Error; +use std::path::Path; + +pub mod core; +pub mod data; +pub mod layer; +pub mod loss; +pub mod metrics; +pub mod models; +pub mod optim; +pub mod training; +pub mod utilities; + +const VERSION: &str = env!("CARGO_PKG_VERSION"); +const FRAMEWORK_NAME: &str = "Bellande AI Training Framework"; + +pub struct Framework { + config: utilities::config::Configuration, + device: Device, + initialized: bool, +} + +impl Framework { + pub fn new() -> Result<Self, Box<dyn Error>> { + let default_config = utilities::config::Configuration::default(); + Ok(Framework { + config: default_config, + device: Device::CPU, + initialized: false, + }) + } + + pub fn with_config<P: AsRef<Path>>(config_path: P) -> Result<Self, Box<dyn Error>> { + let config = utilities::config::Configuration::from_file(config_path)?; + let device = Device::from(&config.system.device)?; + + Ok(Framework { + config, + device, + initialized: false, + }) + } + + pub fn initialize(&mut self) -> Result<(), Box<dyn Error>> { + if self.initialized { + return Ok(()); + } + + // Set random seed if specified + if let Some(seed) = self.config.system.seed { + core::random::set_seed(seed); + } + + // Initialize CUDA if available and requested + if self.device.is_cuda() { + #[cfg(feature = "cuda")] + { + if Device::cuda_device_count() == 0 { + return Err(Box::new(BellandeError::DeviceNotAvailable)); + } + } + #[cfg(not(feature = "cuda"))] + { + return Err(Box::new(BellandeError::NotImplemented( + "CUDA support not compiled".into(), + ))); + } + } + + self.initialized = true; + Ok(()) + } + + pub fn get_version() -> &'static str { + VERSION + } + + pub fn get_name() -> &'static str { + FRAMEWORK_NAME + } + + pub fn system_info() -> String { + format!( + "{} v{}\n\ + CPU Threads: {}\n\ + CUDA Available: {}\n\ + CUDA Devices: {}\n\ + Default Device: {}", + FRAMEWORK_NAME, + VERSION, + num_cpus::get(), + cfg!(feature = "cuda"), + Device::cuda_device_count(), + Device::default(), + ) + } +} diff --git a/src/core/autograd.rs b/src/core/autograd.rs new file mode
100644 index 0000000..976431c --- /dev/null +++ b/src/core/autograd.rs @@ -0,0 +1,92 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. + +use crate::core::error::BellandeError; +use crate::core::tensor::Tensor; +use std::fmt; +use std::sync::Arc; + +// Make AutogradFunction require Debug +pub trait AutogradFunction: Send + Sync + fmt::Debug { + fn forward(&self, inputs: &[&Tensor]) -> Result<Tensor, BellandeError>; + fn backward(&self, grad_output: &Tensor) -> Result<Vec<Tensor>, BellandeError>; +} + +pub struct AutogradContext { + saved_tensors: Vec<Tensor>, + needs_input_grad: Vec<bool>, +} + +impl AutogradContext { + pub fn new(needs_input_grad: Vec<bool>) -> Self { + AutogradContext { + saved_tensors: Vec::new(), + needs_input_grad, + } + } + + pub fn save_for_backward(&mut self, tensor: Tensor) { + self.saved_tensors.push(tensor); + } + + pub fn get_saved_tensors(&self) -> &[Tensor] { + &self.saved_tensors + } +} + +// Add Debug implementations for functions +#[derive(Debug)] +pub struct AddFunction; + +#[derive(Debug)] +pub struct MulFunction; + +#[derive(Debug)] +pub struct MatMulFunction; + +impl AutogradFunction for AddFunction { + fn forward(&self, inputs: &[&Tensor]) -> Result<Tensor, BellandeError> { + if inputs.len() != 2 { + return Err(BellandeError::InvalidInputs( + "Add operation requires exactly 2 input tensors".to_string(), + )); + } + + let a = inputs[0]; + let b = inputs[1]; + if a.shape != b.shape { + return Err(BellandeError::DimensionMismatch); + } + + let mut result_data = Vec::with_capacity(a.data.len()); + for i in 0..a.data.len() { + result_data.push(a.data[i] + b.data[i]); + } + + Ok(Tensor { + data: result_data, + shape: a.shape.clone(), + requires_grad: a.requires_grad || b.requires_grad, + grad: None, + grad_fn: Some(Arc::new(AddFunction)), + device: a.device.clone(), + dtype: a.dtype, + }) + } + + fn backward(&self, grad_output: &Tensor) -> Result<Vec<Tensor>, BellandeError> { + Ok(vec![grad_output.clone(), grad_output.clone()]) + } +} diff --git a/src/core/device.rs b/src/core/device.rs new file mode 100644 index 0000000..6645958 --- /dev/null +++ b/src/core/device.rs @@ -0,0 +1,162 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>.
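+ +// The `Device` enum below models compute placement as either `Device::CPU` or +// `Device::CUDA(index)`. Every CUDA-specific call is gated behind the optional +// `cuda` feature; in a build without it, `cuda_device_count()` reports 0 and +// parsing a "cuda:N" string fails with `BellandeError::DeviceNotAvailable`.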
+ +use crate::core::error::BellandeError; +use std::str::FromStr; + +#[derive(Clone, Debug, PartialEq)] +pub enum Device { + CPU, + CUDA(usize), +} + +impl Device { + pub fn is_cuda(&self) -> bool { + matches!(self, Device::CUDA(_)) + } + + pub fn is_cpu(&self) -> bool { + matches!(self, Device::CPU) + } + + pub fn cuda_device_count() -> usize { + #[cfg(feature = "cuda")] + { + match cuda_runtime::device_count() { + Ok(count) => count, + Err(_) => 0, + } + } + #[cfg(not(feature = "cuda"))] + 0 + } + + pub fn default() -> Self { + Device::CPU + } + + pub fn from(device_str: &str) -> Result<Self, BellandeError> { + Self::from_str(device_str) + } + + #[cfg(feature = "cuda")] + pub fn get_cuda_properties( + device_id: usize, + ) -> Result<cuda_runtime::DeviceProperties, BellandeError> { + cuda_runtime::get_device_properties(device_id) + .map_err(|_| BellandeError::DeviceNotAvailable) + } + + #[cfg(feature = "cuda")] + pub fn set_cuda_device(device_id: usize) -> Result<(), BellandeError> { + cuda_runtime::set_device(device_id).map_err(|_| BellandeError::DeviceNotAvailable) + } + + #[cfg(feature = "cuda")] + pub fn get_current_cuda_device() -> Result<usize, BellandeError> { + cuda_runtime::get_device().map_err(|_| BellandeError::DeviceNotAvailable) + } + + #[cfg(feature = "cuda")] + pub fn reset_cuda_device() -> Result<(), BellandeError> { + cuda_runtime::device_reset().map_err(|_| BellandeError::DeviceNotAvailable) + } + + pub fn get_device_name(&self) -> Result<String, BellandeError> { + match self { + Device::CPU => Ok("CPU".to_string()), + Device::CUDA(device_id) => { + #[cfg(feature = "cuda")] + { + let props = Self::get_cuda_properties(*device_id)?; + Ok(props.name) + } + #[cfg(not(feature = "cuda"))] + Err(BellandeError::NotImplemented( + "CUDA support not compiled".into(), + )) + } + } + } + + pub fn get_device_memory(&self) -> Result<usize, BellandeError> { + match self { + Device::CPU => { + let sys_info = sys_info::mem_info().map_err(|_| { + BellandeError::SystemError("Failed to get system memory info".into()) + })?; + Ok(sys_info.total as usize) + } + Device::CUDA(device_id) => { + #[cfg(feature = "cuda")] + { + let props = Self::get_cuda_properties(*device_id)?; + Ok(props.total_global_mem) + } + #[cfg(not(feature = "cuda"))] + Err(BellandeError::NotImplemented( + "CUDA support not compiled".into(), + )) + } + } + } +} + +impl FromStr for Device { + type Err = BellandeError; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + let s = s.to_lowercase(); + if s == "cpu" { + Ok(Device::CPU) + } else if s.starts_with("cuda") { + if s == "cuda" { + Ok(Device::CUDA(0)) + } else { + let parts: Vec<&str> = s.split(':').collect(); + if parts.len() != 2 { + return Err(BellandeError::InvalidDevice); + } + match parts[1].parse::<usize>() { + Ok(device_id) => { + if device_id < Self::cuda_device_count() { + Ok(Device::CUDA(device_id)) + } else { + Err(BellandeError::DeviceNotAvailable) + } + } + Err(_) => Err(BellandeError::InvalidDevice), + } + } + } else { + Err(BellandeError::InvalidDevice) + } + } +} + +impl std::fmt::Display for Device { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Device::CPU => write!(f, "cpu"), + Device::CUDA(device_id) => write!(f, "cuda:{}", device_id), + } + } +} + +impl Default for Device { + fn default() -> Self { + Self::CPU + } +} diff --git a/src/core/dtype.rs b/src/core/dtype.rs new file mode 100644 index 0000000..a2e5651 --- /dev/null +++ b/src/core/dtype.rs @@ -0,0 +1,55 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it
under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>. + +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum DataType { + Float32, + Float64, + Int32, + Int64, + Bool, +} + +impl DataType { + pub fn size_in_bytes(&self) -> usize { + match self { + DataType::Float32 => 4, + DataType::Float64 => 8, + DataType::Int32 => 4, + DataType::Int64 => 8, + DataType::Bool => 1, + } + } + + pub fn is_floating_point(&self) -> bool { + matches!(self, DataType::Float32 | DataType::Float64) + } + + pub fn default() -> Self { + DataType::Float32 + } +} + +impl std::fmt::Display for DataType { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + DataType::Float32 => write!(f, "float32"), + DataType::Float64 => write!(f, "float64"), + DataType::Int32 => write!(f, "int32"), + DataType::Int64 => write!(f, "int64"), + DataType::Bool => write!(f, "bool"), + } + } +} diff --git a/src/core/error.rs b/src/core/error.rs new file mode 100644 index 0000000..8af21f9 --- /dev/null +++ b/src/core/error.rs @@ -0,0 +1,74 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <https://www.gnu.org/licenses/>.
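+ +// Crate-wide error type: each failure path in the framework maps to one of +// these variants, and the `Error`/`Display` impls below let callers propagate +// it with `?` as a `Box<dyn Error>`.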
+ +use std::error::Error; +use std::fmt; + +#[derive(Debug)] +pub enum BellandeError { + NoGradients, + InvalidShape(String), + DimensionMismatch, + InvalidBackward(String), + DeviceNotAvailable, + InvalidDevice, + SerializationError(String), + InvalidDataType, + InvalidInputs(String), + IndexOutOfBounds, + LockError, + CUDAError(String), + IOError(String), + RuntimeError(String), + ImageError(String), + InvalidOperation(String), + InvalidConfiguration(String), + NotImplemented(String), + EarlyStopping(String), + ShapeMismatch(String), + InvalidParameter(String), + SystemError(String), +} + +impl Error for BellandeError {} + +impl fmt::Display for BellandeError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + BellandeError::NoGradients => write!(f, "Gradients not enabled for this tensor"), + BellandeError::InvalidShape(msg) => write!(f, "Invalid tensor shape: {}", msg), + BellandeError::DimensionMismatch => write!(f, "Tensor dimensions do not match"), + BellandeError::InvalidBackward(msg) => write!(f, "Invalid backward operation: {}", msg), + BellandeError::DeviceNotAvailable => write!(f, "Requested device not available"), + BellandeError::InvalidDevice => write!(f, "Invalid device specification"), + BellandeError::SerializationError(msg) => write!(f, "Serialization error: {}", msg), + BellandeError::InvalidDataType => write!(f, "Invalid data type"), + BellandeError::InvalidInputs(msg) => write!(f, "Invalid inputs: {}", msg), + BellandeError::IndexOutOfBounds => write!(f, "Index out of bounds"), + BellandeError::LockError => write!(f, "Lock error"), + BellandeError::CUDAError(msg) => write!(f, "CUDA error: {}", msg), + BellandeError::IOError(err) => write!(f, "IO error: {}", err), + BellandeError::RuntimeError(msg) => write!(f, "Runtime error: {}", msg), + BellandeError::ImageError(msg) => write!(f, "Image error: {}", msg), + BellandeError::InvalidOperation(msg) => write!(f, "Invalid operation: {}", msg), + BellandeError::InvalidConfiguration(msg) => write!(f, "Invalid configuration: {}", msg), + BellandeError::NotImplemented(msg) => write!(f, "Not implemented: {}", msg), + BellandeError::EarlyStopping(msg) => write!(f, "Early stopping: {}", msg), + BellandeError::ShapeMismatch(msg) => write!(f, "Shape mismatch: {}", msg), + BellandeError::InvalidParameter(msg) => write!(f, "Invalid parameter: {}", msg), + BellandeError::SystemError(msg) => write!(f, "System error: {}", msg), + } + } +} diff --git a/src/core/mod.rs b/src/core/mod.rs new file mode 100644 index 0000000..dd211ca --- /dev/null +++ b/src/core/mod.rs @@ -0,0 +1,6 @@ +pub mod autograd; +pub mod device; +pub mod dtype; +pub mod error; +pub mod random; +pub mod tensor; diff --git a/src/core/random.rs b/src/core/random.rs new file mode 100644 index 0000000..63b2de5 --- /dev/null +++ b/src/core/random.rs @@ -0,0 +1,50 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use rand::prelude::*;
+use rand_distr::{Normal, Uniform};
+use std::cell::RefCell;
+
+thread_local! {
+    static GENERATOR: RefCell<StdRng> = RefCell::new(StdRng::from_entropy());
+}
+
+pub fn set_seed(seed: u64) {
+    GENERATOR.with(|g| {
+        *g.borrow_mut() = StdRng::seed_from_u64(seed);
+    });
+}
+
+pub fn normal(mean: f32, std: f32, size: usize) -> Vec<f32> {
+    let normal = Normal::new(mean as f64, std as f64).unwrap();
+    GENERATOR.with(|g| {
+        (0..size)
+            .map(|_| normal.sample(&mut *g.borrow_mut()) as f32)
+            .collect()
+    })
+}
+
+pub fn uniform(low: f32, high: f32, size: usize) -> Vec<f32> {
+    let uniform = Uniform::new(low, high);
+    GENERATOR.with(|g| {
+        (0..size)
+            .map(|_| uniform.sample(&mut *g.borrow_mut()))
+            .collect()
+    })
+}
+
+pub fn bernoulli(p: f32, size: usize) -> Vec<bool> {
+    GENERATOR.with(|g| (0..size).map(|_| g.borrow_mut().gen::<f32>() < p).collect())
+}
diff --git a/src/core/tensor.rs b/src/core/tensor.rs
new file mode 100644
index 0000000..7aef4b2
--- /dev/null
+++ b/src/core/tensor.rs
@@ -0,0 +1,709 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
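A usage sketch for the seeded RNG helpers above, assuming the signatures shown; the `init_weights` helper and its He-style scale are illustrative, not part of the crate:

    use crate::core::random;

    fn init_weights(fan_in: usize) -> Vec<f32> {
        random::set_seed(42); // fixed seed for reproducible initialization
        let std = (2.0 / fan_in as f32).sqrt(); // assumed He-style scale
        random::normal(0.0, std, fan_in) // one weight per incoming connection
    }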
+
+use crate::core::{
+    autograd::AutogradFunction, device::Device, dtype::DataType, error::BellandeError,
+};
+use std::ops::{Add, Mul, Sub};
+use std::sync::Arc;
+
+#[derive(Clone, Debug)]
+pub struct Tensor {
+    pub data: Vec<f32>,
+    pub shape: Vec<usize>,
+    pub requires_grad: bool,
+    pub grad: Option<Vec<f32>>,
+    pub grad_fn: Option<Arc<dyn AutogradFunction>>,
+    pub device: Device,
+    pub dtype: DataType,
+}
+
+impl Tensor {
+    pub fn new(
+        data: Vec<f32>,
+        shape: Vec<usize>,
+        requires_grad: bool,
+        device: Device,
+        dtype: DataType,
+    ) -> Self {
+        let size = shape.iter().product();
+        assert_eq!(data.len(), size, "Data size does not match shape");
+
+        Tensor {
+            data,
+            shape,
+            requires_grad,
+            grad: if requires_grad {
+                Some(vec![0.0; size])
+            } else {
+                None
+            },
+            grad_fn: None,
+            device,
+            dtype,
+        }
+    }
+
+    // Data access methods
+    pub fn data(&self) -> &[f32] {
+        &self.data
+    }
+
+    pub fn data_mut(&mut self) -> &mut [f32] {
+        &mut self.data
+    }
+
+    pub fn shape(&self) -> &[usize] {
+        &self.shape
+    }
+
+    // Calculate stride for the current shape
+    pub fn stride(&self) -> Vec<usize> {
+        let mut stride = Vec::with_capacity(self.shape.len());
+        let mut current_stride = 1;
+        for &dim in self.shape.iter().rev() {
+            stride.push(current_stride);
+            current_stride *= dim;
+        }
+        stride.reverse();
+        stride
+    }
+
+    pub fn add(&self, other: &Tensor) -> Result<Tensor, BellandeError> {
+        if self.shape != other.shape {
+            return Err(BellandeError::ShapeMismatch(
+                "Tensors must have the same shape for addition".into(),
+            ));
+        }
+
+        let output: Vec<f32> = self
+            .data
+            .iter()
+            .zip(other.data.iter())
+            .map(|(&a, &b)| a + b)
+            .collect();
+
+        Ok(Tensor::new(
+            output,
+            self.shape.clone(),
+            self.requires_grad || other.requires_grad,
+            self.device.clone(),
+            self.dtype,
+        ))
+    }
+
+    pub fn get_device(&self) -> &Device {
+        &self.device
+    }
+
+    pub fn get_dtype(&self) -> &DataType {
+        &self.dtype
+    }
+
+    pub fn permute(&self, dims: &[usize]) -> Result<Tensor, BellandeError> {
+        if dims.len() != self.shape.len() {
+            return Err(BellandeError::InvalidShape(format!(
+                "Permutation dimensions must match tensor dimensions: expected {}, got {}",
+                self.shape.len(),
+                dims.len()
+            )));
+        }
+
+        let mut new_shape = vec![0; self.shape.len()];
+        for (i, &dim) in dims.iter().enumerate() {
+            if dim >= self.shape.len() {
+                return Err(BellandeError::InvalidShape(format!(
+                    "Invalid permutation dimension: {}",
+                    dim
+                )));
+            }
+            new_shape[i] = self.shape[dim];
+        }
+
+        let mut new_data = vec![0.0; self.data.len()];
+        let strides = self.compute_strides();
+        let new_strides = compute_strides(&new_shape);
+
+        for i in 0..self.data.len() {
+            let old_indices = get_indices(i, &strides, &self.shape);
+            let mut new_indices = vec![0; old_indices.len()];
+            for (j, &dim) in dims.iter().enumerate() {
+                new_indices[j] = old_indices[dim];
+            }
+            let new_idx = get_flat_index(&new_indices, &new_strides);
+            new_data[new_idx] = self.data[i];
+        }
+
+        Ok(Tensor::new(
+            new_data,
+            new_shape,
+            self.requires_grad,
+            self.device.clone(),
+            self.dtype,
+        ))
+    }
+
+    pub fn scale(&self, factor: f32) -> Result<Tensor, BellandeError> {
+        let new_data = self.data.iter().map(|&x| x * factor).collect();
+
+        Ok(Tensor::new(
+            new_data,
+            self.shape.clone(),
+            self.requires_grad,
+            self.device.clone(),
+            self.dtype,
+        ))
+    }
+
+    fn compute_strides(&self) -> Vec<usize> {
+        compute_strides(&self.shape)
+    }
+
+    pub fn zeros(shape: &[usize]) -> Self {
+        let size = shape.iter().product();
+        Tensor::new(
+            vec![0.0; size],
+            shape.to_vec(),
+            false,
+            Device::default(),
+            DataType::default(),
+        )
+    }
+
+    pub fn ones(shape: &[usize]) -> Self {
+        let size = shape.iter().product();
+        Tensor::new(
+            vec![1.0; size],
+            shape.to_vec(),
+            false,
+            Device::default(),
+            DataType::default(),
+        )
+    }
+
+    pub fn randn(shape: &[usize]) -> Self {
+        let size = shape.iter().product();
+        Tensor::new(
+            crate::core::random::normal(0.0, 1.0, size),
+            shape.to_vec(),
+            false,
+            Device::default(),
+            DataType::default(),
+        )
+    }
+
+    pub fn stack(tensors: &[Tensor]) -> Result<Tensor, BellandeError> {
+        if tensors.is_empty() {
+            return Err(BellandeError::InvalidInputs(
+                "cannot stack an empty list of tensors".to_string(),
+            ));
+        }
+
+        let base_shape = tensors[0].shape();
+
+        // Verify all tensors have the same shape
+        for (i, tensor) in tensors.iter().enumerate().skip(1) {
+            if tensor.shape() != base_shape {
+                return Err(BellandeError::ShapeMismatch(format!(
+                    "tensor 0 has shape {:?} but tensor {} has shape {:?}",
+                    base_shape,
+                    i,
+                    tensor.shape()
+                )));
+            }
+        }
+
+        // Calculate new shape with batch dimension
+        let mut new_shape = vec![tensors.len()];
+        new_shape.extend(base_shape);
+
+        // Calculate total size
+        let total_size = new_shape.iter().product();
+        let batch_size: usize = base_shape.iter().product();
+        let mut result_data = vec![0.0; total_size];
+
+        // Copy data from each tensor
+        for (i, tensor) in tensors.iter().enumerate() {
+            let start = i * batch_size;
+            let end = start + batch_size;
+            result_data[start..end].copy_from_slice(&tensor.data);
+        }
+
+        Ok(Tensor::new(
+            result_data,
+            new_shape,
+            tensors[0].requires_grad,
+            tensors[0].device.clone(),
+            tensors[0].dtype,
+        ))
+    }
+
+    pub fn copy_slice(&mut self, batch_idx: usize, source: &Tensor) -> Result<(), BellandeError> {
+        let strides = self.stride();
+        if strides.is_empty() {
+            return Err(BellandeError::InvalidShape("Empty tensor shape".into()));
+        }
+
+        let batch_stride = strides[0];
+        let start_idx = batch_idx * batch_stride;
+        let end_idx = start_idx + batch_stride;
+
+        if end_idx > self.data.len() {
+            return Err(BellandeError::IndexOutOfBounds);
+        }
+
+        // Check if source has correct size
+        if source.data.len() != batch_stride {
+            return Err(BellandeError::DimensionMismatch);
+        }
+
+        self.data[start_idx..end_idx].copy_from_slice(&source.data);
+        Ok(())
+    }
+
+    pub fn backward(&mut self) -> Result<(), BellandeError> {
+        if !self.requires_grad {
+            return Err(BellandeError::NoGradients);
+        }
+
+        if self.grad.is_none() {
+            self.grad = Some(vec![1.0; self.data.len()]);
+        }
+
+        if let Some(ref grad_fn) = self.grad_fn {
+            if let Some(ref grad) = self.grad {
+                grad_fn.backward(&Tensor::new(
+                    grad.clone(),
+                    self.shape.clone(),
+                    false,
+                    self.device.clone(),
+                    self.dtype,
+                ))?;
+            }
+        }
+
+        Ok(())
+    }
+
+    pub fn matmul(&self, other: &Tensor) -> Result<Tensor, BellandeError> {
+        if self.shape.len() != 2 || other.shape.len() != 2 {
+            return Err(BellandeError::InvalidShape(
+                "Tensors must be 2D for matmul".into(),
+            ));
+        }
+
+        let (m, k) = (self.shape[0], self.shape[1]);
+        let (k2, n) = (other.shape[0], other.shape[1]);
+
+        if k != k2 {
+            return Err(BellandeError::DimensionMismatch);
+        }
+
+        let mut result = vec![0.0; m * n];
+        for i in 0..m {
+            for j in 0..n {
+                let mut sum = 0.0;
+                // Row-major layout: self[(i, p)] lives at i * k + p and
+                // other[(p, j)] at p * n + j.
+                for p in 0..k {
+                    sum += self.data[i * k + p] * other.data[p * n + j];
+                }
+                result[i * n + j] = sum;
+            }
+        }
+
+        Ok(Tensor::new(
+            result,
+            vec![m, n],
+            self.requires_grad || other.requires_grad,
+            self.device.clone(),
+            self.dtype,
+        ))
+    }
+
+    pub fn to_device(&self, device: &Device) -> Result<Tensor, BellandeError> {
+        Ok(Tensor {
+            data: self.data.clone(),
+            shape: self.shape.clone(),
+            requires_grad: self.requires_grad,
+            grad: self.grad.clone(),
+            grad_fn: self.grad_fn.clone(),
+            device: device.clone(),
+            dtype: self.dtype,
}) + } + + pub fn t(&self) -> Result { + if self.shape.len() != 2 { + return Err(BellandeError::InvalidShape( + "Transpose only works on 2D tensors".to_string(), + )); + } + let (rows, cols) = (self.shape[0], self.shape[1]); + let mut transposed = vec![0.0; self.data.len()]; + + for i in 0..rows { + for j in 0..cols { + transposed[j * rows + i] = self.data[i * cols + j]; + } + } + + Ok(Tensor { + data: transposed, + shape: vec![cols, rows], + requires_grad: self.requires_grad, + grad: None, + grad_fn: None, + device: self.device.clone(), + dtype: self.dtype, + }) + } + + pub fn masked_fill(&self, mask: &Tensor, value: f32) -> Result { + if self.shape != mask.shape { + return Err(BellandeError::DimensionMismatch); + } + + let mut new_data = self.data.clone(); + for (i, &mask_val) in mask.data.iter().enumerate() { + if mask_val != 0.0 { + new_data[i] = value; + } + } + + Ok(Tensor { + data: new_data, + shape: self.shape.clone(), + requires_grad: self.requires_grad, + grad: None, + grad_fn: None, + device: self.device.clone(), + dtype: self.dtype, + }) + } + + pub fn softmax(&self, dim: i32) -> Result { + let dim = if dim < 0 { + (self.shape.len() as i32 + dim) as usize + } else { + dim as usize + }; + + if dim >= self.shape.len() { + return Err(BellandeError::RuntimeError(format!( + "Dimension out of range (expected to be in range of [-{}, {}], but got {})", + self.shape.len(), + self.shape.len() - 1, + dim + ))); + } + + let mut result = vec![0.0; self.data.len()]; + let stride = self.get_stride(dim); + let outer_size = self.shape[..dim].iter().product::(); + let inner_size = self.shape[dim + 1..].iter().product::(); + let dim_size = self.shape[dim]; + + for outer in 0..outer_size { + for inner in 0..inner_size { + // Find max for numerical stability + let mut max_val = f32::NEG_INFINITY; + for d in 0..dim_size { + let idx = outer * stride * dim_size + d * stride + inner; + max_val = max_val.max(self.data[idx]); + } + + // Compute exponentials and sum + let mut sum = 0.0; + for d in 0..dim_size { + let idx = outer * stride * dim_size + d * stride + inner; + let exp_val = (self.data[idx] - max_val).exp(); + result[idx] = exp_val; + sum += exp_val; + } + + // Normalize + for d in 0..dim_size { + let idx = outer * stride * dim_size + d * stride + inner; + result[idx] /= sum; + } + } + } + + Ok(Tensor { + data: result, + shape: self.shape.clone(), + requires_grad: self.requires_grad, + grad: None, + grad_fn: None, + device: self.device.clone(), + dtype: self.dtype, + }) + } + + // Helper method for softmax + fn get_stride(&self, dim: usize) -> usize { + let mut stride = 1; + for d in dim + 1..self.shape.len() { + stride *= self.shape[d]; + } + stride + } + + pub fn sum_dim(&self, dim: usize, keepdim: bool) -> Result { + if dim >= self.shape.len() { + return Err(BellandeError::InvalidShape(format!( + "Dimension {} out of bounds", + dim + ))); + } + + let mut new_shape = self.shape.clone(); + if !keepdim { + new_shape.remove(dim); + } else { + new_shape[dim] = 1; + } + + let stride: usize = self.shape[dim..].iter().product(); + let outer_stride: usize = self.shape[..dim].iter().product(); + let inner_size: usize = stride / self.shape[dim]; + let mut result = vec![0.0; new_shape.iter().product()]; + + for i in 0..outer_stride { + for k in 0..inner_size { + let mut sum = 0.0; + for j in 0..self.shape[dim] { + let idx = i * stride + j * inner_size + k; + sum += self.data[idx]; + } + result[i * inner_size + k] = sum; + } + } + + Ok(Tensor { + data: result, + shape: new_shape, + requires_grad: 
self.requires_grad, + grad: None, + grad_fn: None, + device: self.device.clone(), + dtype: self.dtype, + }) + } + + pub fn sum_all_dims(&self) -> Result { + let sum = self.data.iter().sum(); + Ok(Tensor { + data: vec![sum], + shape: vec![1], + requires_grad: self.requires_grad, + grad: None, + grad_fn: None, + device: self.device.clone(), + dtype: self.dtype, + }) + } + + pub fn reshape(&self, new_shape: &[usize]) -> Result { + let new_size: usize = new_shape.iter().product(); + if new_size != self.data.len() { + return Err(BellandeError::InvalidShape(format!( + "Cannot reshape tensor of size {} to shape {:?}", + self.data.len(), + new_shape + ))); + } + + Ok(Tensor { + data: self.data.clone(), + shape: new_shape.to_vec(), + requires_grad: self.requires_grad, + grad: self.grad.clone(), + grad_fn: self.grad_fn.clone(), + device: self.device.clone(), + dtype: self.dtype, + }) + } + + pub fn mul(&self, other: &Tensor) -> Result { + if self.shape != other.shape { + return Err(BellandeError::ShapeMismatch( + "Shapes must match for element-wise multiplication".into(), + )); + } + + let new_data: Vec = self + .data + .iter() + .zip(other.data.iter()) + .map(|(&a, &b)| a * b) + .collect(); + + Ok(Tensor::new( + new_data, + self.shape.clone(), + self.requires_grad || other.requires_grad, + self.device.clone(), + self.dtype, + )) + } + + pub fn transpose(&self) -> Result { + if self.shape.len() != 2 { + return Err(BellandeError::InvalidShape( + "Transpose requires a 2D tensor".into(), + )); + } + + let (rows, cols) = (self.shape[0], self.shape[1]); + let mut new_data = vec![0.0; self.data.len()]; + + for i in 0..rows { + for j in 0..cols { + new_data[j * rows + i] = self.data[i * cols + j]; + } + } + + Ok(Tensor::new( + new_data, + vec![cols, rows], + self.requires_grad, + self.device.clone(), + self.dtype, + )) + } + + pub fn narrow(&self, dim: usize, start: usize, length: usize) -> Result { + if dim >= self.shape.len() { + return Err(BellandeError::InvalidShape(format!( + "Dimension {} out of range for tensor with {} dimensions", + dim, + self.shape.len() + ))); + } + + if start + length > self.shape[dim] { + return Err(BellandeError::InvalidShape( + "Narrow operation out of bounds".into(), + )); + } + + let mut new_shape = self.shape.clone(); + new_shape[dim] = length; + + let mut new_data = Vec::new(); + let stride = self.get_stride(dim); + + // Collect the narrowed data + for i in 0..self.data.len() { + let dim_idx = (i / stride) % self.shape[dim]; + if dim_idx >= start && dim_idx < start + length { + new_data.push(self.data[i]); + } + } + + Ok(Tensor::new( + new_data, + new_shape, + self.requires_grad, + self.device.clone(), + self.dtype, + )) + } + + // Hyperbolic tangent + pub fn tanh(&self) -> Result { + let new_data: Vec = self.data.iter().map(|&x| x.tanh()).collect(); + + Ok(Tensor::new( + new_data, + self.shape.clone(), + self.requires_grad, + self.device.clone(), + self.dtype, + )) + } + + // Element-wise subtraction + pub fn sub(&self, other: &Tensor) -> Result { + if self.shape != other.shape { + return Err(BellandeError::ShapeMismatch( + "Shapes must match for subtraction".into(), + )); + } + + let new_data: Vec = self + .data + .iter() + .zip(other.data.iter()) + .map(|(&a, &b)| a - b) + .collect(); + + Ok(Tensor::new( + new_data, + self.shape.clone(), + self.requires_grad || other.requires_grad, + self.device.clone(), + self.dtype, + )) + } +} + +fn compute_strides(shape: &[usize]) -> Vec { + let mut strides = vec![1; shape.len()]; + for i in (0..shape.len() - 1).rev() { + 
strides[i] = strides[i + 1] * shape[i + 1];
+    }
+    strides
+}
+
+fn get_indices(flat_idx: usize, strides: &[usize], shape: &[usize]) -> Vec<usize> {
+    let mut indices = vec![0; shape.len()];
+    let mut remaining = flat_idx;
+    for i in 0..shape.len() {
+        indices[i] = remaining / strides[i];
+        remaining %= strides[i];
+    }
+    indices
+}
+
+fn get_flat_index(indices: &[usize], strides: &[usize]) -> usize {
+    indices
+        .iter()
+        .zip(strides.iter())
+        .map(|(&idx, &stride)| idx * stride)
+        .sum()
+}
+
+impl Add for &Tensor {
+    type Output = Result<Tensor, BellandeError>;
+
+    fn add(self, other: &Tensor) -> Self::Output {
+        // Delegate explicitly to the inherent method, which performs the shape check.
+        Tensor::add(self, other)
+    }
+}
+
+impl Mul for &Tensor {
+    type Output = Result<Tensor, BellandeError>;
+
+    fn mul(self, other: &Tensor) -> Self::Output {
+        Tensor::mul(self, other)
+    }
+}
+
+impl Sub for &Tensor {
+    type Output = Result<Tensor, BellandeError>;
+
+    fn sub(self, other: &Tensor) -> Self::Output {
+        Tensor::sub(self, other)
+    }
+}
diff --git a/src/data/augmentation.rs b/src/data/augmentation.rs
new file mode 100644
index 0000000..dca01c3
--- /dev/null
+++ b/src/data/augmentation.rs
@@ -0,0 +1,113 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
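A brief sketch of the operator impls above in use; shape mismatches surface as Err values rather than panics (the `demo` function and its shapes are illustrative):

    use crate::core::{error::BellandeError, tensor::Tensor};

    fn demo() -> Result<(), BellandeError> {
        let a = Tensor::ones(&[2, 3]);
        let b = Tensor::zeros(&[2, 3]);
        let sum = (&a + &b)?; // Add for &Tensor yields Result<Tensor, BellandeError>
        let prod = (&a * &b)?; // element-wise multiply via the Mul impl
        assert_eq!(sum.shape(), prod.shape());
        Ok(())
    }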
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+
+use rand::Rng;
+
+pub trait Transform: Send + Sync {
+    fn apply(&self, tensor: &Tensor) -> Result<Tensor, BellandeError>;
+}
+
+pub struct Compose {
+    transforms: Vec<Box<dyn Transform>>,
+}
+
+impl Compose {
+    pub fn new(transforms: Vec<Box<dyn Transform>>) -> Self {
+        Compose { transforms }
+    }
+}
+
+impl Transform for Compose {
+    fn apply(&self, tensor: &Tensor) -> Result<Tensor, BellandeError> {
+        let mut current = tensor.clone();
+        for transform in &self.transforms {
+            current = transform.apply(&current)?;
+        }
+        Ok(current)
+    }
+}
+
+pub struct RandomHorizontalFlip {
+    p: f32,
+}
+
+impl RandomHorizontalFlip {
+    pub fn new(p: f32) -> Self {
+        assert!((0.0..=1.0).contains(&p), "probability must be in [0, 1]");
+        RandomHorizontalFlip { p }
+    }
+}
+
+impl Transform for RandomHorizontalFlip {
+    fn apply(&self, tensor: &Tensor) -> Result<Tensor, BellandeError> {
+        if tensor.shape.len() != 4 {
+            return Err(BellandeError::InvalidShape(
+                "RandomHorizontalFlip expects a 4D (N, C, H, W) tensor".to_string(),
+            ));
+        }
+
+        let mut rng = rand::thread_rng();
+        if rng.gen::<f32>() > self.p {
+            return Ok(tensor.clone());
+        }
+
+        let (batch_size, channels, height, width) = (
+            tensor.shape[0],
+            tensor.shape[1],
+            tensor.shape[2],
+            tensor.shape[3],
+        );
+
+        let mut flipped_data = vec![0.0; tensor.data.len()];
+        for b in 0..batch_size {
+            for c in 0..channels {
+                for h in 0..height {
+                    for w in 0..width {
+                        // Mirror each row: column w maps to width - 1 - w.
+                        let src_idx = ((b * channels + c) * height + h) * width + w;
+                        let dst_idx = ((b * channels + c) * height + h) * width + (width - 1 - w);
+                        flipped_data[dst_idx] = tensor.data[src_idx];
+                    }
+                }
+            }
+        }
+
+        Ok(Tensor::new(
+            flipped_data,
+            tensor.shape.clone(),
+            tensor.requires_grad,
+            tensor.device.clone(),
+            tensor.dtype,
+        ))
+    }
+}
+
+pub struct RandomRotation {
+    degrees: (f32, f32),
+}
+
+impl RandomRotation {
+    pub fn new(degrees: (f32, f32)) -> Self {
+        RandomRotation { degrees }
+    }
+}
+
+impl Transform for RandomRotation {
+    fn apply(&self, _tensor: &Tensor) -> Result<Tensor, BellandeError> {
+        // Rotation within `self.degrees` is not implemented yet; surface a
+        // recoverable error instead of panicking.
+        Err(BellandeError::NotImplemented(format!(
+            "RandomRotation({:?}) is not implemented yet",
+            self.degrees
+        )))
+    }
+}
diff --git a/src/data/dataloader.rs b/src/data/dataloader.rs
new file mode 100644
index 0000000..a0a9f22
--- /dev/null
+++ b/src/data/dataloader.rs
@@ -0,0 +1,193 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
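The Compose/Transform pair above chains boxed transforms in declaration order. A sketch under the assumption that the input is a 4D (N, C, H, W) tensor; `augment` and `image_batch` are illustrative names:

    fn augment(image_batch: &Tensor) -> Result<Tensor, BellandeError> {
        let pipeline = Compose::new(vec![
            Box::new(RandomHorizontalFlip::new(0.5)), // flip with probability 0.5
        ]);
        pipeline.apply(image_batch) // each stage returns a fresh tensor
    }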
+
+use crate::core::error::BellandeError;
+use crate::core::tensor::Tensor;
+use crate::data::{dataset::Dataset, sampler::Sampler};
+use rayon::prelude::*;
+use std::sync::{Arc, Mutex};
+
+pub struct DataLoader {
+    dataset: Arc<Box<dyn Dataset>>,
+    batch_size: usize,
+    shuffle: bool,
+    num_workers: usize,
+    sampler: Option<Arc<Mutex<Box<dyn Sampler>>>>,
+    drop_last: bool,
+}
+
+impl DataLoader {
+    pub fn new(
+        dataset: Box<dyn Dataset>,
+        batch_size: usize,
+        shuffle: bool,
+        num_workers: usize,
+        sampler: Option<Box<dyn Sampler>>,
+        drop_last: bool,
+    ) -> Self {
+        DataLoader {
+            dataset: Arc::new(dataset),
+            batch_size,
+            shuffle,
+            num_workers,
+            sampler: sampler.map(|s| Arc::new(Mutex::new(s))),
+            drop_last,
+        }
+    }
+
+    pub fn iter(&self) -> DataLoaderIterator {
+        DataLoaderIterator {
+            dataloader: self,
+            index: 0,
+        }
+    }
+}
+
+pub struct DataLoaderIterator<'a> {
+    dataloader: &'a DataLoader,
+    index: usize,
+}
+
+impl<'a> Iterator for DataLoaderIterator<'a> {
+    type Item = Result<(Tensor, Tensor), BellandeError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.index >= self.dataloader.dataset.len() {
+            return None;
+        }
+
+        let batch_indices: Vec<usize> = if let Some(sampler) = &self.dataloader.sampler {
+            match sampler.lock() {
+                Ok(mut sampler) => sampler.sample(self.dataloader.batch_size),
+                Err(_) => return Some(Err(BellandeError::LockError)),
+            }
+        } else if self.dataloader.shuffle {
+            // Draws a fresh random subset for every batch, so samples may
+            // repeat across batches within one pass over the dataset.
+            use rand::seq::SliceRandom;
+            let mut rng = rand::thread_rng();
+            let mut indices: Vec<usize> = (0..self.dataloader.dataset.len()).collect();
+            indices.shuffle(&mut rng);
+            indices[..self.dataloader.batch_size.min(indices.len())].to_vec()
+        } else {
+            let end = (self.index + self.dataloader.batch_size).min(self.dataloader.dataset.len());
+            (self.index..end).collect()
+        };
+
+        if batch_indices.is_empty()
+            || (self.dataloader.drop_last && batch_indices.len() < self.dataloader.batch_size)
+        {
+            return None;
+        }
+
+        let batch: Vec<(Tensor, Tensor)> = if self.dataloader.num_workers > 1 {
+            batch_indices
+                .par_iter()
+                .map(|&idx| self.dataloader.dataset.get(idx))
+                .collect()
+        } else {
+            batch_indices
+                .iter()
+                .map(|&idx| self.dataloader.dataset.get(idx))
+                .collect()
+        };
+
+        self.index += self.dataloader.batch_size;
+
+        if batch.is_empty() {
+            None
+        } else {
+            Some(collate_batch(batch))
+        }
+    }
+}
+
+fn get_batch_shape(tensors: &[Tensor]) -> Result<Vec<usize>, BellandeError> {
+    if tensors.is_empty() {
+        return Err(BellandeError::InvalidInputs(
+            "Empty tensor batch".to_string(),
+        ));
+    }
+
+    let base_shape = tensors[0].shape();
+
+    // Verify all tensors have the same shape
+    for (i, tensor) in tensors.iter().enumerate().skip(1) {
+        if tensor.shape() != base_shape {
+            return Err(BellandeError::ShapeMismatch(format!(
+                "tensor 0 has shape {:?} but tensor {} has shape {:?}",
+                base_shape,
+                i,
+                tensor.shape()
+            )));
+        }
+    }
+
+    // Create the batch shape: [batch_size, ...base_shape]
+    let mut batch_shape = vec![tensors.len()];
+    batch_shape.extend(base_shape);
+    Ok(batch_shape)
+}
+
+fn collate_batch(batch: Vec<(Tensor, Tensor)>) -> Result<(Tensor, Tensor), BellandeError> {
+    if batch.is_empty() {
+        return Err(BellandeError::InvalidInputs(
+            "Empty batch provided".to_string(),
+        ));
+    }
+
+    // Split the batch into data and labels
+    let (data_tensors, label_tensors): (Vec<Tensor>, Vec<Tensor>) = batch.into_iter().unzip();
+
+    // Get shapes for data and labels
+    let data_shape = get_batch_shape(&data_tensors)?;
+    let label_shape = get_batch_shape(&label_tensors)?;
+
+    // Create storage for batched data
+    let mut batched_data = Tensor::zeros(&data_shape);
+    let mut batched_labels = Tensor::zeros(&label_shape);
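+    // Each sample is one contiguous run of stride()[0] elements in these
+    // zero-initialized batch tensors: sample i fills [i * stride, (i + 1) * stride).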
+ // Copy data into the batched tensor + for (i, data) in data_tensors.iter().enumerate() { + copy_tensor_slice(&mut batched_data, i, data)?; + } + + // Copy labels into the batched tensor + for (i, label) in label_tensors.iter().enumerate() { + copy_tensor_slice(&mut batched_labels, i, label)?; + } + + Ok((batched_data, batched_labels)) +} + +fn copy_tensor_slice( + dest: &mut Tensor, + batch_idx: usize, + source: &Tensor, +) -> Result<(), BellandeError> { + let batch_stride = dest.stride()[0]; + let start_idx = batch_idx * batch_stride; + let end_idx = start_idx + batch_stride; + + if end_idx > dest.data().len() { + return Err(BellandeError::IndexOutOfBounds); + } + + if source.data().len() != batch_stride { + return Err(BellandeError::DimensionMismatch); + } + + let dest_slice = &mut dest.data_mut()[start_idx..end_idx]; + dest_slice.copy_from_slice(source.data()); + Ok(()) +} diff --git a/src/data/dataset.rs b/src/data/dataset.rs new file mode 100644 index 0000000..86c6bd5 --- /dev/null +++ b/src/data/dataset.rs @@ -0,0 +1,24 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +use crate::core::tensor::Tensor; + +pub trait Dataset: Send + Sync { + fn len(&self) -> usize; + fn get(&self, index: usize) -> (Tensor, Tensor); + fn is_empty(&self) -> bool { + self.len() == 0 + } +} diff --git a/src/data/image_decoder.rs b/src/data/image_decoder.rs new file mode 100644 index 0000000..f24a47d --- /dev/null +++ b/src/data/image_decoder.rs @@ -0,0 +1,338 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
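Putting the loader together: a hedged sketch of a consuming loop, assuming some boxed Dataset implementation (the function name and the shape comments are illustrative):

    fn train_epoch(dataset: Box<dyn Dataset>) -> Result<(), BellandeError> {
        // batch_size = 32, shuffled, 4 workers, no custom sampler, keep the last partial batch
        let loader = DataLoader::new(dataset, 32, true, 4, None, false);
        for batch in loader.iter() {
            let (images, labels) = batch?; // e.g. images [32, C, H, W], labels [32, 1]
            let _ = (images, labels); // a forward/backward pass would go here
        }
        Ok(())
    }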
+ +use crate::core::{device::Device, dtype::DataType, error::BellandeError, tensor::Tensor}; +use std::collections::HashMap; +use std::io::Read; +use std::path::{Path, PathBuf}; + +/// Basic image format detector +#[derive(Debug, PartialEq)] +enum ImageFormat { + JPEG, + PNG, + Unknown, +} + +/// RGB pixel structure +#[derive(Clone, Copy, Debug)] +struct RGB { + r: u8, + g: u8, + b: u8, +} + +/// Image decoder implementation +pub struct ImageDecoder { + width: usize, + height: usize, + channels: usize, + data: Vec, +} + +impl ImageDecoder { + /// Creates a new image decoder + pub fn new(bytes: &[u8]) -> Result { + let format = Self::detect_format(bytes)?; + match format { + ImageFormat::JPEG => Self::decode_jpeg(bytes), + ImageFormat::PNG => Self::decode_png(bytes), + ImageFormat::Unknown => Err(BellandeError::ImageError( + "Unsupported image format".to_string(), + )), + } + } + + /// Detects the image format from magic bytes + fn detect_format(bytes: &[u8]) -> Result { + if bytes.len() < 4 { + return Err(BellandeError::ImageError("Invalid image data".to_string())); + } + + match &bytes[0..4] { + [0xFF, 0xD8, 0xFF, _] => Ok(ImageFormat::JPEG), + [0x89, 0x50, 0x4E, 0x47] => Ok(ImageFormat::PNG), + _ => Ok(ImageFormat::Unknown), + } + } + + /// Basic JPEG decoder implementation + fn decode_jpeg(bytes: &[u8]) -> Result { + // This is a basic implementation - you'll need to implement full JPEG decoding + let mut reader = std::io::Cursor::new(bytes); + let mut marker = [0u8; 2]; + + // Find SOF0 marker (Start Of Frame) + loop { + reader.read_exact(&mut marker).map_err(|e| { + BellandeError::ImageError(format!("Failed to read JPEG marker: {}", e)) + })?; + + if marker[0] != 0xFF { + return Err(BellandeError::ImageError("Invalid JPEG marker".to_string())); + } + + match marker[1] { + 0xC0 => break, // SOF0 marker + 0xD9 => return Err(BellandeError::ImageError("Reached end of JPEG".to_string())), + _ => { + let mut length = [0u8; 2]; + reader.read_exact(&mut length).map_err(|e| { + BellandeError::ImageError(format!("Failed to read length: {}", e)) + })?; + let length = u16::from_be_bytes(length) as u64 - 2; + reader.set_position(reader.position() + length); + } + } + } + + // Read image dimensions + let mut header = [0u8; 5]; + reader + .read_exact(&mut header) + .map_err(|e| BellandeError::ImageError(format!("Failed to read SOF0 header: {}", e)))?; + + let height = u16::from_be_bytes([header[1], header[2]]) as usize; + let width = u16::from_be_bytes([header[3], header[4]]) as usize; + let channels = 3; // Assume RGB + + // Create placeholder data (you'll need to implement actual JPEG decoding) + let data = vec![0u8; width * height * channels]; + + Ok(Self { + width, + height, + channels, + data, + }) + } + + /// Basic PNG decoder implementation + fn decode_png(bytes: &[u8]) -> Result { + // This is a basic implementation - you'll need to implement full PNG decoding + let mut reader = std::io::Cursor::new(bytes); + let mut header = [0u8; 8]; + + // Skip PNG signature + reader + .read_exact(&mut header) + .map_err(|e| BellandeError::ImageError(format!("Failed to read PNG header: {}", e)))?; + + // Read IHDR chunk + let mut length = [0u8; 4]; + reader.read_exact(&mut length).map_err(|e| { + BellandeError::ImageError(format!("Failed to read chunk length: {}", e)) + })?; + + let mut ihdr = [0u8; 8]; + reader + .read_exact(&mut ihdr) + .map_err(|e| BellandeError::ImageError(format!("Failed to read IHDR: {}", e)))?; + + let width = u32::from_be_bytes([ihdr[0], ihdr[1], ihdr[2], ihdr[3]]) as usize; + let 
height = u32::from_be_bytes([ihdr[4], ihdr[5], ihdr[6], ihdr[7]]) as usize; + let channels = 3; // Assume RGB + + // Create placeholder data (you'll need to implement actual PNG decoding) + let data = vec![0u8; width * height * channels]; + + Ok(Self { + width, + height, + channels, + data, + }) + } + + /// Converts image data to tensor + pub fn to_tensor(&self) -> Result { + let mut tensor_data = Vec::with_capacity(self.width * self.height * self.channels); + + // Convert u8 to f32 and normalize to [0, 1] + for &byte in &self.data { + tensor_data.push(f32::from(byte) / 255.0); + } + + Ok(Tensor::new( + tensor_data, + vec![1, self.channels, self.height, self.width], + false, + Device::CPU, + DataType::Float32, + )) + } + + /// Resizes the image to specified dimensions + pub fn resize(&mut self, new_width: usize, new_height: usize) -> Result<(), BellandeError> { + if new_width == self.width && new_height == self.height { + return Ok(()); + } + + let mut new_data = vec![0u8; new_width * new_height * self.channels]; + + // Simple bilinear interpolation + for y in 0..new_height { + for x in 0..new_width { + let src_x = (x as f32 * self.width as f32 / new_width as f32).floor() as usize; + let src_y = (y as f32 * self.height as f32 / new_height as f32).floor() as usize; + + for c in 0..self.channels { + let src_idx = (src_y * self.width + src_x) * self.channels + c; + let dst_idx = (y * new_width + x) * self.channels + c; + new_data[dst_idx] = self.data[src_idx]; + } + } + } + + self.width = new_width; + self.height = new_height; + self.data = new_data; + + Ok(()) + } +} + +// Update ImageFolder implementation to use the decoder +pub struct ImageFolder { + path: PathBuf, + cache: HashMap, + supported_extensions: Vec, +} + +impl ImageFolder { + /// Creates a new ImageFolder instance + pub fn new>(path: P) -> Result { + let path = path.as_ref().to_path_buf(); + + if !path.exists() { + return Err(BellandeError::ImageError(format!( + "Image folder does not exist: {}", + path.display() + ))); + } + + if !path.is_dir() { + return Err(BellandeError::ImageError(format!( + "Path is not a directory: {}", + path.display() + ))); + } + + Ok(Self { + path, + cache: HashMap::new(), + supported_extensions: vec!["jpg".to_string(), "jpeg".to_string(), "png".to_string()], + }) + } + + /// Decodes an image from bytes + fn decode_image(bytes: &[u8]) -> Result { + let mut decoder = ImageDecoder::new(bytes)?; + + // Resize to standard dimensions if needed + if decoder.width != 224 || decoder.height != 224 { + decoder.resize(224, 224)?; + } + + decoder.to_tensor() + } + + /// Loads an image from a file path + pub fn load_image>(&mut self, image_path: P) -> Result { + let path = image_path.as_ref().to_path_buf(); + + // Check cache first + if let Some(tensor) = self.cache.get(&path) { + return Ok(tensor.clone()); + } + + if !path.exists() { + return Err(BellandeError::ImageError(format!( + "Image file does not exist: {}", + path.display() + ))); + } + + // Verify file extension + if let Some(ext) = path.extension() { + if !self + .supported_extensions + .iter() + .any(|e| e == &ext.to_string_lossy()) + { + return Err(BellandeError::ImageError(format!( + "Unsupported image format: {}", + path.display() + ))); + } + } + + let bytes = std::fs::read(&path).map_err(|e| { + BellandeError::ImageError(format!( + "Failed to read image file {}: {}", + path.display(), + e + )) + })?; + + let tensor = Self::decode_image(&bytes)?; + + // Cache the result + self.cache.insert(path, tensor.clone()); + + Ok(tensor) + } + + /// Lists 
all images in the folder + pub fn list_images(&self) -> Result, BellandeError> { + let mut images = Vec::new(); + + for entry in std::fs::read_dir(&self.path).map_err(|e| { + BellandeError::ImageError(format!( + "Failed to read directory {}: {}", + self.path.display(), + e + )) + })? { + let entry = entry.map_err(|e| { + BellandeError::ImageError(format!("Failed to read directory entry: {}", e)) + })?; + + let path = entry.path(); + + if let Some(ext) = path.extension() { + if self + .supported_extensions + .iter() + .any(|e| e == &ext.to_string_lossy()) + { + images.push(path); + } + } + } + + Ok(images) + } + + /// Clears the image cache + pub fn clear_cache(&mut self) { + self.cache.clear(); + } + + /// Gets the base path of the image folder + pub fn path(&self) -> &Path { + &self.path + } +} diff --git a/src/data/image_folder.rs b/src/data/image_folder.rs new file mode 100644 index 0000000..1e4d01e --- /dev/null +++ b/src/data/image_folder.rs @@ -0,0 +1,959 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +use crate::core::{device::Device, dtype::DataType, error::BellandeError, tensor::Tensor}; +use crate::data::augmentation::Transform; +use crate::utilities::byte::{BigEndian, ReadBytes}; +use crate::utilities::compression::Decoder; +use std::collections::HashMap; +use std::fs::{self, File}; +use std::io::{self, Cursor, Read, Seek, SeekFrom}; +use std::path::PathBuf; +use std::sync::Arc; +use std::sync::RwLock; + +/// Implementation of From trait for error conversion +impl From for BellandeError { + fn from(error: std::io::Error) -> Self { + BellandeError::IOError(error.to_string()) + } +} + +/// A reader that allows reading individual bits from a byte stream +pub struct BitReader { + reader: R, + buffer: u8, + bits_remaining: u8, +} + +/// Image format enumeration +#[derive(Debug, Clone, Copy, PartialEq)] +enum ImageFormat { + JPEG, + PNG, + Unknown, +} + +#[derive(Debug, Clone, Copy)] +struct RGBPixel { + r: u8, + g: u8, + b: u8, +} + +impl RGBPixel { + fn new(r: u8, g: u8, b: u8) -> Self { + RGBPixel { r, g, b } + } +} + +/// Trait defining the interface for datasets +pub trait Dataset: Send + Sync { + fn len(&self) -> usize; + fn get(&self, index: usize) -> Result<(Tensor, Tensor), BellandeError>; + fn is_empty(&self) -> bool { + self.len() == 0 + } + fn num_classes(&self) -> usize; +} + +/// Structure for managing image datasets organized in folders +pub struct ImageFolder { + root: PathBuf, + samples: Vec<(PathBuf, usize)>, + transform: Option>, + target_transform: Option>, + class_to_idx: HashMap, + cache: Option>>>, + cache_size: usize, +} + +impl BitReader { + /// Creates a new BitReader from a byte stream + pub fn new(reader: R) -> Self { + Self { + reader, + buffer: 0, + bits_remaining: 0, + } + } + + /// Reads a single bit from the stream + pub fn read_bit(&mut self) -> io::Result { + if 
self.bits_remaining == 0 { + let mut byte = [0u8; 1]; + self.reader.read_exact(&mut byte)?; + self.buffer = byte[0]; + self.bits_remaining = 8; + } + + self.bits_remaining -= 1; + Ok(((self.buffer >> self.bits_remaining) & 1) == 1) + } + + /// Reads multiple bits and returns them as a u32 + pub fn read_bits(&mut self, mut count: u8) -> io::Result { + let mut result = 0u32; + + while count > 0 { + result = (result << 1) | (if self.read_bit()? { 1 } else { 0 }); + count -= 1; + } + + Ok(result) + } +} + +impl ImageFolder { + const JPEG_NATURAL_ORDER: [usize; 64] = [ + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, + 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, + ]; + + /// Creates a new ImageFolder dataset + pub fn new( + root: PathBuf, + transform: Option>, + target_transform: Option>, + ) -> Result { + let mut samples = Vec::new(); + let mut class_to_idx = HashMap::new(); + + Self::validate_root_directory(&root)?; + Self::scan_directory(&root, &mut samples, &mut class_to_idx)?; + + if samples.is_empty() { + return Err(BellandeError::IOError("No valid images found".to_string())); + } + + Ok(ImageFolder { + root, + samples, + transform, + target_transform, + class_to_idx, + cache: Some(RwLock::new(HashMap::new())), + cache_size: 1000, + }) + } + + /// Creates a new ImageFolder with specified cache size + pub fn with_cache_size( + root: PathBuf, + transform: Option>, + target_transform: Option>, + cache_size: usize, + ) -> Result { + let mut folder = Self::new(root, transform, target_transform)?; + folder.cache_size = cache_size; + Ok(folder) + } + + /// Validates the root directory exists and is a directory + fn validate_root_directory(root: &PathBuf) -> Result<(), BellandeError> { + if !root.exists() || !root.is_dir() { + return Err(BellandeError::IOError("Invalid root directory".to_string())); + } + Ok(()) + } + + /// Scans the directory structure and builds the dataset + fn scan_directory( + root: &PathBuf, + samples: &mut Vec<(PathBuf, usize)>, + class_to_idx: &mut HashMap, + ) -> Result<(), BellandeError> { + for (idx, entry) in fs::read_dir(root)?.enumerate() { + let entry = entry?; + let path = entry.path(); + + if path.is_dir() { + let class_name = path + .file_name() + .ok_or_else(|| { + BellandeError::IOError("Invalid class directory name".to_string()) + })? + .to_string_lossy() + .into_owned(); + + class_to_idx.insert(class_name, idx); + Self::scan_images(&path, idx, samples)?; + } + } + Ok(()) + } + + /// Scans for images in a directory + fn scan_images( + path: &PathBuf, + class_idx: usize, + samples: &mut Vec<(PathBuf, usize)>, + ) -> Result<(), BellandeError> { + for entry in fs::read_dir(path)? 
{ + let entry = entry?; + let path = entry.path(); + + if path.is_file() && Self::is_valid_image(&path) { + samples.push((path, class_idx)); + } else if path.is_dir() { + Self::scan_images(&path, class_idx, samples)?; + } + } + Ok(()) + } + + /// Checks if a file is a valid image based on its extension and header + fn is_valid_image(path: &PathBuf) -> bool { + if let Some(ext) = path.extension() { + let ext = ext.to_string_lossy().to_lowercase(); + if matches!(ext.as_str(), "jpg" | "jpeg" | "png") { + if let Ok(bytes) = Self::read_image_file(path) { + return Self::detect_image_format(&bytes) != ImageFormat::Unknown; + } + } + } + false + } + + /// Reads an image file to bytes + fn read_image_file(path: &PathBuf) -> Result, BellandeError> { + let mut file = File::open(path) + .map_err(|e| BellandeError::IOError(format!("Failed to open image file: {}", e)))?; + + let mut bytes = Vec::new(); + file.read_to_end(&mut bytes) + .map_err(|e| BellandeError::IOError(format!("Failed to read image file: {}", e)))?; + + Ok(bytes) + } + + /// Detects image format from bytes + fn detect_image_format(bytes: &[u8]) -> ImageFormat { + if bytes.len() < 4 { + return ImageFormat::Unknown; + } + + match &bytes[0..4] { + [0xFF, 0xD8, 0xFF, _] => ImageFormat::JPEG, + [0x89, 0x50, 0x4E, 0x47] => ImageFormat::PNG, + _ => ImageFormat::Unknown, + } + } + + /// Decodes image bytes to RGB pixels + fn decode_image_to_rgb(bytes: &[u8]) -> Result<(Vec, usize, usize), BellandeError> { + match Self::detect_image_format(bytes) { + ImageFormat::JPEG => Self::decode_jpeg(bytes), + ImageFormat::PNG => Self::decode_png(bytes), + ImageFormat::Unknown => Err(BellandeError::ImageError( + "Unknown image format".to_string(), + )), + } + } + + fn decode_jpeg(bytes: &[u8]) -> Result<(Vec, usize, usize), BellandeError> { + let mut cursor = Cursor::new(bytes); + let mut marker = [0u8; 2]; + + // Verify JPEG signature (0xFFD8) + cursor + .read_exact(&mut marker) + .map_err(|e| BellandeError::ImageError(format!("Invalid JPEG header: {}", e)))?; + + if marker != [0xFF, 0xD8] { + return Err(BellandeError::ImageError( + "Not a valid JPEG file".to_string(), + )); + } + + let mut width = 0; + let mut height = 0; + let mut components = 0; + let mut quantization_tables: HashMap> = HashMap::new(); + let mut huffman_tables: HashMap<(u8, u8), Vec> = HashMap::new(); + + loop { + cursor.read_exact(&mut marker)?; + + if marker[0] != 0xFF { + return Err(BellandeError::ImageError("Invalid marker".to_string())); + } + + match marker[1] { + 0xC0 => { + // Start of Frame + let mut segment = [0u8; 8]; + cursor.read_exact(&mut segment)?; + + let precision = segment[0]; + height = u16::from_be_bytes([segment[1], segment[2]]) as usize; + width = u16::from_be_bytes([segment[3], segment[4]]) as usize; + components = segment[5] as usize; + + if precision != 8 { + return Err(BellandeError::ImageError( + "Only 8-bit precision supported".to_string(), + )); + } + + let mut comp_info = vec![0u8; components * 3]; + cursor.read_exact(&mut comp_info)?; + } + + 0xDB => { + // Define Quantization Table + let mut length_bytes = [0u8; 2]; + cursor.read_exact(&mut length_bytes)?; + let length = u16::from_be_bytes(length_bytes) as usize - 2; + + let mut table_data = vec![0u8; length]; + cursor.read_exact(&mut table_data)?; + + let precision = (table_data[0] >> 4) & 0x0F; + let table_id = table_data[0] & 0x0F; + let table_size = if precision == 0 { 64 } else { 128 }; + + quantization_tables.insert(table_id, table_data[1..=table_size].to_vec()); + } + + 0xC4 => { + // Define 
Huffman Table + let mut length_bytes = [0u8; 2]; + cursor.read_exact(&mut length_bytes)?; + let length = u16::from_be_bytes(length_bytes) as usize - 2; + + let mut table_data = vec![0u8; length]; + cursor.read_exact(&mut table_data)?; + + let table_class = (table_data[0] >> 4) & 0x0F; + let table_id = table_data[0] & 0x0F; + + let mut codes = Vec::new(); + let mut offset = 17; + for &length in &table_data[1..17] { + for _ in 0..length { + codes.push(table_data[offset]); + offset += 1; + } + } + + huffman_tables.insert((table_class, table_id), codes); + } + + 0xDA => { + // Start of Scan + let mut length_bytes = [0u8; 2]; + cursor.read_exact(&mut length_bytes)?; + let length = u16::from_be_bytes(length_bytes) as usize - 2; + + let mut scan_data = vec![0u8; length]; + cursor.read_exact(&mut scan_data)?; + + // Process compressed data + let mut pixels = vec![RGBPixel::new(0, 0, 0); width * height]; + let mut bit_reader = BitReader::new(&mut cursor); + + // Process MCUs (Minimum Coded Units) + let mcu_width = ((width + 7) / 8) * 8; + let mcu_height = ((height + 7) / 8) * 8; + + for y in (0..mcu_height).step_by(8) { + for x in (0..mcu_width).step_by(8) { + for component in 0..components { + let component_u8 = component as u8; + let qtable = &quantization_tables[&component_u8]; + let (dc_table, ac_table) = ( + &huffman_tables[&(0u8, component_u8)], + &huffman_tables[&(1u8, component_u8)], + ); + + let block = Self::decode_block( + &mut bit_reader, + dc_table, + ac_table, + qtable, + )?; + + if component == 0 { + for by in 0..8 { + for bx in 0..8 { + let px = x + bx; + let py = y + by; + if px < width && py < height { + let idx = py * width + px; + pixels[idx].r = block[by * 8 + bx] as u8; + pixels[idx].g = block[by * 8 + bx] as u8; + pixels[idx].b = block[by * 8 + bx] as u8; + } + } + } + } + } + } + } + + return Ok((pixels, width, height)); + } + + 0xD9 => break, // End of Image + + _ => { + // Skip other markers + let mut length_bytes = [0u8; 2]; + cursor.read_exact(&mut length_bytes)?; + let length = u16::from_be_bytes(length_bytes) as usize - 2; + cursor.seek(SeekFrom::Current(length as i64))?; + } + } + } + + Err(BellandeError::ImageError( + "Failed to decode JPEG".to_string(), + )) + } + + fn decode_block( + bit_reader: &mut BitReader, + dc_table: &[u8], + ac_table: &[u8], + qtable: &[u8], + ) -> Result, BellandeError> { + const BLOCK_SIZE: usize = 64; + let mut block = vec![0u8; BLOCK_SIZE]; + let mut zz = [0i32; BLOCK_SIZE]; + + // Decode DC coefficient + let dc_value = Self::decode_huffman_value(bit_reader, dc_table)?; + if dc_value > 0 { + let bits = Self::receive_and_extend(bit_reader, dc_value as u8)?; + zz[0] = bits; + } + + // Decode AC coefficients + let mut k = 1; + while k < BLOCK_SIZE { + let rs = Self::decode_huffman_value(bit_reader, ac_table)?; + let s = rs & 0x0F; + let r = rs >> 4; + + if s == 0 { + if r == 15 { + k += 16; // Skip 16 zeros + continue; + } + break; // End of block + } + + k += r as usize; // Skip zeros + if k >= BLOCK_SIZE { + return Err(BellandeError::ImageError( + "Invalid AC coefficient index".to_string(), + )); + } + + // Read additional bits + let value = Self::receive_and_extend(bit_reader, s as u8)?; + zz[Self::JPEG_NATURAL_ORDER[k]] = value; + k += 1; + } + + // Dequantize + for i in 0..BLOCK_SIZE { + zz[i] *= qtable[i] as i32; + } + + // Inverse DCT + Self::inverse_dct(&mut zz); + + // Level shift and clamp values + for i in 0..BLOCK_SIZE { + let val = ((zz[i] + 128) >> 8).clamp(0, 255); + block[i] = val as u8; + } + + Ok(block) + } + + fn 
decode_huffman_value( + bit_reader: &mut BitReader, + table: &[u8], + ) -> Result { + let mut code = 0; + let mut code_len = 0; + let mut index = 0; + + loop { + code = (code << 1) + | if bit_reader + .read_bit() + .map_err(|e| BellandeError::ImageError(e.to_string()))? + { + 1 + } else { + 0 + }; + code_len += 1; + + while index < table.len() && table[index] as u8 == code_len { + if code as u8 == table[index + 1] { + return Ok(table[index + 2]); + } + index += 3; + } + + if code_len >= 16 { + return Err(BellandeError::ImageError( + "Invalid Huffman code".to_string(), + )); + } + } + } + + fn receive_and_extend( + bit_reader: &mut BitReader, + nbits: u8, + ) -> Result { + if nbits == 0 { + return Ok(0); + } + + let value = bit_reader + .read_bits(nbits) + .map_err(|e| BellandeError::ImageError(e.to_string()))? as i32; + + let vt = 1 << (nbits - 1); + Ok(if value < vt { + value + (-1 << nbits) + 1 + } else { + value + }) + } + + fn inverse_dct(block: &mut [i32; 64]) { + // Constants for IDCT + const W1: i32 = 2841; // 2048*sqrt(2)*cos(1*pi/16) + const W2: i32 = 2676; // 2048*sqrt(2)*cos(2*pi/16) + const W3: i32 = 2408; // 2048*sqrt(2)*cos(3*pi/16) + const W5: i32 = 1609; // 2048*sqrt(2)*cos(5*pi/16) + const W6: i32 = 1108; // 2048*sqrt(2)*cos(6*pi/16) + const W7: i32 = 565; // 2048*sqrt(2)*cos(7*pi/16) + + let mut tmp = [0i32; 64]; + + // Row IDCT + for i in 0..8 { + let row_offset = i * 8; + let x0 = block[row_offset]; + let x1 = block[row_offset + 4]; + let x2 = block[row_offset + 2]; + let x3 = block[row_offset + 6]; + let x4 = block[row_offset + 1]; + let x5 = block[row_offset + 5]; + let x6 = block[row_offset + 3]; + let x7 = block[row_offset + 7]; + + // Stage 1 + let x8 = W7 * (x4 + x5); + let x4 = x8 + (W1 - W7) * x4; + let x5 = x8 - (W1 + W7) * x5; + let x8 = W3 * (x6 + x7); + let x6 = x8 - (W3 - W5) * x6; + let x7 = x8 - (W3 + W5) * x7; + + // Stage 2 + let x8 = x0 + x1; + let x0 = x0 - x1; + let x1 = W6 * (x2 + x3); + let x2 = x1 - (W2 + W6) * x3; + let x3 = x1 + (W2 - W6) * x2; + + // Stage 3 + let x1 = x4 + x6; + let x4 = x4 - x6; + let x6 = x5 + x7; + let x5 = x5 - x7; + + // Stage 4 + let x7 = x8 + x3; + let x8_final = x8 - x3; // Renamed to avoid shadowing + let x3 = x0 + x2; + let x0 = x0 - x2; + let x2 = (181 * (x4 + x5) + 128) >> 8; + let x4 = (181 * (x4 - x5) + 128) >> 8; + + // Output + tmp[row_offset] = (x7 + x1) >> 3; + tmp[row_offset + 1] = (x3 + x2) >> 3; + tmp[row_offset + 2] = (x0 + x4) >> 3; + tmp[row_offset + 3] = (x8_final + x6) >> 3; + tmp[row_offset + 4] = (x8_final - x6) >> 3; + tmp[row_offset + 5] = (x0 - x4) >> 3; + tmp[row_offset + 6] = (x3 - x2) >> 3; + tmp[row_offset + 7] = (x7 - x1) >> 3; + } + + // Column IDCT + for i in 0..8 { + let x0 = tmp[i]; + let x1 = tmp[i + 32]; + let x2 = tmp[i + 16]; + let x3 = tmp[i + 48]; + let x4 = tmp[i + 8]; + let x5 = tmp[i + 40]; + let x6 = tmp[i + 24]; + let x7 = tmp[i + 56]; + + // Stage 1 + let x8 = W7 * (x4 + x5); + let x4 = x8 + (W1 - W7) * x4; + let x5 = x8 - (W1 + W7) * x5; + let x8 = W3 * (x6 + x7); + let x6 = x8 - (W3 - W5) * x6; + let x7 = x8 - (W3 + W5) * x7; + + // Stage 2 + let x8 = x0 + x1; + let x0 = x0 - x1; + let x1 = W6 * (x2 + x3); + let x2 = x1 - (W2 + W6) * x3; + let x3 = x1 + (W2 - W6) * x2; + + // Stage 3 + let x1 = x4 + x6; + let x4 = x4 - x6; + let x6 = x5 + x7; + let x5 = x5 - x7; + + // Stage 4 + let x7 = x8 + x3; + let x8_final = x8 - x3; + let x3 = x0 + x2; + let x0 = x0 - x2; + let x2 = (181 * (x4 + x5) + 128) >> 8; + let x4 = (181 * (x4 - x5) + 128) >> 8; + + // Final output with proper 
scaling + block[i] = (x7 + x1) >> 14; + block[i + 8] = (x3 + x2) >> 14; + block[i + 16] = (x0 + x4) >> 14; + block[i + 24] = (x8_final + x6) >> 14; + block[i + 32] = (x8_final - x6) >> 14; + block[i + 40] = (x0 - x4) >> 14; + block[i + 48] = (x3 - x2) >> 14; + block[i + 56] = (x7 - x1) >> 14; + } + } + + /// Decodes PNG image bytes + fn decode_png(bytes: &[u8]) -> Result<(Vec, usize, usize), BellandeError> { + let mut cursor = Cursor::new(bytes); + + // Verify PNG signature + let mut signature = [0u8; 8]; + cursor.read_exact(&mut signature).map_err(|e| { + BellandeError::ImageError(format!("Failed to read PNG signature: {}", e)) + })?; + + if signature != [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A] { + return Err(BellandeError::ImageError( + "Invalid PNG signature".to_string(), + )); + } + + let mut width = 0; + let mut height = 0; + let mut image_data = Vec::new(); + let mut palette = Vec::new(); + let mut bit_depth = 0; + let mut color_type = 0; + + loop { + let length = cursor.read_u32::().map_err(|e| { + BellandeError::ImageError(format!("Failed to read chunk length: {}", e)) + })? as usize; + + let mut chunk_type = [0u8; 4]; + cursor.read_exact(&mut chunk_type).map_err(|e| { + BellandeError::ImageError(format!("Failed to read chunk type: {}", e)) + })?; + + match &chunk_type { + b"IHDR" => { + width = cursor.read_u32::().map_err(|e| { + BellandeError::ImageError(format!("Failed to read width: {}", e)) + })? as usize; + height = cursor.read_u32::().map_err(|e| { + BellandeError::ImageError(format!("Failed to read height: {}", e)) + })? as usize; + + let mut ihdr_data = [0u8; 5]; + cursor.read_exact(&mut ihdr_data).map_err(|e| { + BellandeError::ImageError(format!("Failed to read IHDR data: {}", e)) + })?; + + bit_depth = ihdr_data[0]; + color_type = ihdr_data[1]; + + cursor.seek(SeekFrom::Current(4))?; // Skip CRC + } + + b"PLTE" => { + palette = vec![0u8; length]; + cursor.read_exact(&mut palette).map_err(|e| { + BellandeError::ImageError(format!("Failed to read palette: {}", e)) + })?; + cursor.seek(SeekFrom::Current(4))?; // Skip CRC + } + + b"IDAT" => { + let mut chunk_data = vec![0u8; length]; + cursor.read_exact(&mut chunk_data).map_err(|e| { + BellandeError::ImageError(format!("Failed to read IDAT chunk: {}", e)) + })?; + image_data.extend(chunk_data); + cursor.seek(SeekFrom::Current(4))?; // Skip CRC + } + + b"IEND" => break, + + _ => { + cursor + .seek(SeekFrom::Current((length + 4) as i64)) + .map_err(|e| { + BellandeError::ImageError(format!("Failed to skip chunk: {}", e)) + })?; + } + } + } + + // Process image data based on color type + let mut decoder = Decoder::new(&image_data[..]); + let mut decoded_data = Vec::new(); + decoder.read_to_end(&mut decoded_data)?; + + let pixels = match color_type { + 2 => { + // RGB + let bpp = 3; + let stride = width * bpp + 1; + let mut pixels = Vec::with_capacity(width * height); + + for y in 0..height { + let row_start = y * stride + 1; // Skip filter byte + for x in 0..width { + let i = row_start + x * bpp; + pixels.push(RGBPixel::new( + decoded_data[i], + decoded_data[i + 1], + decoded_data[i + 2], + )); + } + } + pixels + } + + 3 => { + // Palette + if palette.is_empty() { + return Err(BellandeError::ImageError( + "Missing palette data".to_string(), + )); + } + + let stride = width + 1; + let mut pixels = Vec::with_capacity(width * height); + + for y in 0..height { + let row_start = y * stride + 1; // Skip filter byte + for x in 0..width { + let index = (decoded_data[row_start + x] as usize) * 3; + pixels.push(RGBPixel::new( + 
palette[index], + palette[index + 1], + palette[index + 2], + )); + } + } + pixels + } + + 6 => { + // RGBA + let bpp = 4; + let stride = width * bpp + 1; + let mut pixels = Vec::with_capacity(width * height); + + for y in 0..height { + let row_start = y * stride + 1; // Skip filter byte + for x in 0..width { + let i = row_start + x * bpp; + pixels.push(RGBPixel::new( + decoded_data[i], + decoded_data[i + 1], + decoded_data[i + 2], + )); + } + } + pixels + } + + _ => { + return Err(BellandeError::ImageError(format!( + "Unsupported color type: {}", + color_type + ))) + } + }; + + Ok((pixels, width, height)) + } + + /// Converts RGB pixels to tensor + fn rgb_to_tensor( + pixels: &[RGBPixel], + width: usize, + height: usize, + ) -> Result { + if pixels.len() != width * height { + return Err(BellandeError::ImageError(format!( + "Invalid pixel buffer size: expected {}, got {}", + width * height, + pixels.len() + ))); + } + + let mut data = Vec::with_capacity(3 * width * height); + + // Convert to CHW format and normalize to [0, 1] + for channel in 0..3 { + data.extend(pixels.iter().map(|pixel| { + let value = match channel { + 0 => pixel.r, + 1 => pixel.g, + 2 => pixel.b, + _ => unreachable!(), + }; + f32::from(value) / 255.0 + })); + } + + Ok(Tensor::new( + data, + vec![1, 3, height, width], + false, + Device::CPU, + DataType::Float32, + )) + } + + /// Gets a cached tensor or loads it from disk + fn get_cached_tensor(&self, path: &PathBuf) -> Result, BellandeError> { + if let Some(cache_lock) = &self.cache { + // Try to read from cache first + if let Ok(cache) = cache_lock.read() { + if let Some(tensor) = cache.get(path) { + return Ok(Arc::clone(tensor)); + } + } + + // Not in cache, load it + let bytes = Self::read_image_file(path)?; + let (pixels, width, height) = Self::decode_image_to_rgb(&bytes)?; + let tensor = Arc::new(Self::rgb_to_tensor(&pixels, width, height)?); + + // Update cache + if let Ok(mut cache) = cache_lock.write() { + // Manage cache size + if cache.len() >= self.cache_size { + if let Some(key) = cache.keys().next().cloned() { + cache.remove(&key); + } + } + cache.insert(path.clone(), Arc::clone(&tensor)); + } + + Ok(tensor) + } else { + // Cache disabled, just load and return + let bytes = Self::read_image_file(path)?; + let (pixels, width, height) = Self::decode_image_to_rgb(&bytes)?; + Ok(Arc::new(Self::rgb_to_tensor(&pixels, width, height)?)) + } + } + + pub fn num_classes(&self) -> usize { + self.class_to_idx.len() + } + + pub fn get_class_to_idx(&self) -> &HashMap { + &self.class_to_idx + } + + pub fn get_sample_path(&self, index: usize) -> Option<&PathBuf> { + self.samples.get(index).map(|(path, _)| path) + } + + pub fn set_caching(&mut self, enabled: bool) { + self.cache = if enabled { + Some(RwLock::new(HashMap::new())) + } else { + None + }; + } + + pub fn clear_cache(&self) { + if let Some(cache_lock) = &self.cache { + if let Ok(mut cache) = cache_lock.write() { + cache.clear(); + } + } + } +} + +impl Dataset for ImageFolder { + fn len(&self) -> usize { + self.samples.len() + } + + fn num_classes(&self) -> usize { + self.num_classes() + } + + fn get(&self, index: usize) -> Result<(Tensor, Tensor), BellandeError> { + let (path, class_idx) = &self.samples[index]; + let input = self.get_cached_tensor(path)?; + + let target = Tensor::new( + vec![*class_idx as f32], + vec![1], + false, + input.get_device().clone(), + input.get_dtype().clone(), + ); + + let mut final_input = (*input).clone(); + if let Some(transform) = &self.transform { + final_input = 
transform.apply(&final_input)?; + } + + let mut final_target = target; + if let Some(target_transform) = &self.target_transform { + final_target = target_transform.apply(&final_target)?; + } + + Ok((final_input, final_target)) + } +} diff --git a/src/data/image_transformation_augmentation.rs b/src/data/image_transformation_augmentation.rs new file mode 100644 index 0000000..9b93b7e --- /dev/null +++ b/src/data/image_transformation_augmentation.rs @@ -0,0 +1,315 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +use crate::core::{error::BellandeError, tensor::Tensor}; +use rand::{thread_rng, Rng}; + +/// Trait for image transformations +pub trait Transform: Send + Sync { + fn apply(&self, tensor: &Tensor) -> Result; + fn name(&self) -> &str; +} + +/// Center crop transformation +pub struct CenterCrop { + height: usize, + width: usize, +} + +impl CenterCrop { + pub fn new(height: usize, width: usize) -> Self { + Self { height, width } + } +} + +impl Transform for CenterCrop { + fn apply(&self, tensor: &Tensor) -> Result { + let shape = tensor.shape(); + if shape.len() != 4 { + return Err(BellandeError::InvalidShape( + "Expected 4D tensor".to_string(), + )); + } + + let [batch_size, channels, in_height, in_width] = shape[..4] else { + return Err(BellandeError::InvalidShape( + "Invalid tensor shape".to_string(), + )); + }; + + if in_height < self.height || in_width < self.width { + return Err(BellandeError::InvalidOperation( + "Crop size larger than input size".into(), + )); + } + + let start_h = (in_height - self.height) / 2; + let start_w = (in_width - self.width) / 2; + let mut cropped = vec![0.0; batch_size * channels * self.height * self.width]; + + for b in 0..batch_size { + for c in 0..channels { + for h in 0..self.height { + for w in 0..self.width { + let src_idx = ((b * channels + c) * in_height + (start_h + h)) * in_width + + (start_w + w); + let dst_idx = ((b * channels + c) * self.height + h) * self.width + w; + cropped[dst_idx] = tensor.data()[src_idx]; + } + } + } + } + + Ok(Tensor::new( + cropped, + vec![batch_size, channels, self.height, self.width], + tensor.requires_grad, + tensor.device.clone(), + tensor.dtype, + )) + } + + fn name(&self) -> &str { + "CenterCrop" + } +} + +/// Random crop transformation +pub struct RandomCrop { + height: usize, + width: usize, +} + +impl RandomCrop { + pub fn new(height: usize, width: usize) -> Self { + Self { height, width } + } +} + +impl Transform for RandomCrop { + fn apply(&self, tensor: &Tensor) -> Result { + let shape = tensor.shape(); + if shape.len() != 4 { + return Err(BellandeError::InvalidShape( + "Expected 4D tensor".to_string(), + )); + } + + let [batch_size, channels, in_height, in_width] = shape[..4] else { + return Err(BellandeError::InvalidShape( + "Invalid tensor shape".to_string(), + )); + }; + + if in_height < self.height || in_width < 
self.width { + return Err(BellandeError::InvalidOperation( + "Crop size larger than input size".into(), + )); + } + + let mut rng = thread_rng(); + let start_h = rng.gen_range(0..=in_height - self.height); + let start_w = rng.gen_range(0..=in_width - self.width); + let mut cropped = vec![0.0; batch_size * channels * self.height * self.width]; + + for b in 0..batch_size { + for c in 0..channels { + for h in 0..self.height { + for w in 0..self.width { + let src_idx = ((b * channels + c) * in_height + (start_h + h)) * in_width + + (start_w + w); + let dst_idx = ((b * channels + c) * self.height + h) * self.width + w; + cropped[dst_idx] = tensor.data()[src_idx]; + } + } + } + } + + Ok(Tensor::new( + cropped, + vec![batch_size, channels, self.height, self.width], + tensor.requires_grad, + tensor.device.clone(), + tensor.dtype, + )) + } + + fn name(&self) -> &str { + "RandomCrop" + } +} + +pub struct RandomVerticalFlip { + probability: f32, +} + +impl RandomVerticalFlip { + pub fn new(probability: f32) -> Self { + Self { probability } + } +} + +impl Transform for RandomVerticalFlip { + fn apply(&self, tensor: &Tensor) -> Result { + if thread_rng().gen::() > self.probability { + return Ok(tensor.clone()); + } + + let shape = tensor.shape(); + if shape.len() != 4 { + return Err(BellandeError::InvalidShape("Expected 4D tensor".into())); + } + + let [batch_size, channels, height, width] = shape[..4] else { + return Err(BellandeError::InvalidShape("Invalid tensor shape".into())); + }; + + let mut flipped = vec![0.0; tensor.data.len()]; + for b in 0..batch_size { + for c in 0..channels { + for h in 0..height { + for w in 0..width { + let src_idx = ((b * channels + c) * height + h) * width + w; + let dst_idx = ((b * channels + c) * height + (height - 1 - h)) * width + w; + flipped[dst_idx] = tensor.data[src_idx]; + } + } + } + } + + Ok(Tensor::new( + flipped, + shape.to_vec(), + tensor.requires_grad, + tensor.device.clone(), + tensor.dtype, + )) + } + + fn name(&self) -> &str { + "RandomVerticalFlip" + } +} + +pub struct ColorJitter { + brightness: f32, + contrast: f32, + saturation: f32, +} + +impl ColorJitter { + pub fn new(brightness: f32, contrast: f32, saturation: f32) -> Self { + Self { + brightness, + contrast, + saturation, + } + } + + fn adjust_brightness(&self, data: &mut [f32]) { + let factor = 1.0 + thread_rng().gen_range(-self.brightness..=self.brightness); + for value in data.iter_mut() { + *value = (*value * factor).max(0.0).min(1.0); + } + } + + fn adjust_contrast(&self, data: &mut [f32]) { + let factor = 1.0 + thread_rng().gen_range(-self.contrast..=self.contrast); + let mean = data.iter().sum::() / data.len() as f32; + for value in data.iter_mut() { + *value = ((*value - mean) * factor + mean).max(0.0).min(1.0); + } + } + + fn adjust_saturation(&self, data: &mut [f32], shape: &[usize]) { + if shape[1] != 3 { + return; + } + + let factor = 1.0 + thread_rng().gen_range(-self.saturation..=self.saturation); + let size = shape[0] * shape[2] * shape[3]; + + for i in 0..size { + let r = data[i]; + let g = data[i + size]; + let b = data[i + size * 2]; + let gray = 0.2989 * r + 0.5870 * g + 0.1140 * b; + + data[i] = ((r - gray) * factor + gray).max(0.0).min(1.0); + data[i + size] = ((g - gray) * factor + gray).max(0.0).min(1.0); + data[i + size * 2] = ((b - gray) * factor + gray).max(0.0).min(1.0); + } + } +} + +impl Transform for ColorJitter { + fn apply(&self, tensor: &Tensor) -> Result { + let shape = tensor.shape().to_vec(); + let mut data = tensor.data().to_vec(); + + 
self.adjust_brightness(&mut data);
+        self.adjust_contrast(&mut data);
+        self.adjust_saturation(&mut data, &shape);
+
+        Ok(Tensor::new(
+            data,
+            shape,
+            tensor.requires_grad,
+            tensor.device.clone(),
+            tensor.dtype,
+        ))
+    }
+
+    fn name(&self) -> &str {
+        "ColorJitter"
+    }
+}
+
+/// Gaussian noise transformation
+pub struct GaussianNoise {
+    mean: f32,
+    std: f32,
+}
+
+impl GaussianNoise {
+    pub fn new(mean: f32, std: f32) -> Self {
+        Self { mean, std }
+    }
+}
+
+impl Transform for GaussianNoise {
+    fn apply(&self, tensor: &Tensor) -> Result<Tensor, BellandeError> {
+        let mut rng = thread_rng();
+        let mut noisy = tensor.data.to_vec();
+        let shape = tensor.shape().to_vec();
+
+        for value in noisy.iter_mut() {
+            // Box-Muller transform: converts two uniform samples into a
+            // standard normal sample, so the added noise is actually Gaussian
+            // (the earlier uniform draw in (-2, 2) was not).
+            let u1: f32 = rng.gen::<f32>().max(f32::MIN_POSITIVE);
+            let u2: f32 = rng.gen();
+            let z = (-2.0 * u1.ln()).sqrt() * (2.0 * std::f32::consts::PI * u2).cos();
+            let noise = z * self.std + self.mean;
+            *value = (*value + noise).max(0.0).min(1.0);
+        }
+
+        Ok(Tensor::new(
+            noisy,
+            shape,
+            tensor.requires_grad,
+            tensor.device.clone(),
+            tensor.dtype,
+        ))
+    }
+
+    fn name(&self) -> &str {
+        "GaussianNoise"
+    }
+}
diff --git a/src/data/mod.rs b/src/data/mod.rs
new file mode 100644
index 0000000..ca42788
--- /dev/null
+++ b/src/data/mod.rs
@@ -0,0 +1,8 @@
+pub mod augmentation;
+pub mod dataloader;
+pub mod dataset;
+pub mod image_decoder;
+pub mod image_folder;
+pub mod image_transformation_augmentation;
+pub mod preprocessing;
+pub mod sampler;
diff --git a/src/data/preprocessing.rs b/src/data/preprocessing.rs
new file mode 100644
index 0000000..d4923bd
--- /dev/null
+++ b/src/data/preprocessing.rs
@@ -0,0 +1,69 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
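+
+// Usage sketch for the `Normalize` preprocessor defined below. The channel
+// statistics shown are the common ImageNet values, used here purely as a
+// hypothetical example:
+//
+//     let normalize = Normalize::new(
+//         vec![0.485, 0.456, 0.406],
+//         vec![0.229, 0.224, 0.225],
+//     );
+//     let normalized = normalize.process(&image_tensor)?;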
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+
+pub trait Preprocessor: Send + Sync {
+    fn process(&self, tensor: &Tensor) -> Result<Tensor, BellandeError>;
+}
+
+pub struct Normalize {
+    mean: Vec<f32>,
+    std: Vec<f32>,
+}
+
+impl Normalize {
+    pub fn new(mean: Vec<f32>, std: Vec<f32>) -> Self {
+        assert_eq!(mean.len(), std.len());
+        Normalize { mean, std }
+    }
+}
+
+impl Preprocessor for Normalize {
+    fn process(&self, tensor: &Tensor) -> Result<Tensor, BellandeError> {
+        if tensor.shape.len() != 4 {
+            return Err(BellandeError::InvalidShape(
+                "Expected 4D tensor (batch_size, channels, height, width)".into(),
+            ));
+        }
+
+        let (batch_size, channels, height, width) = (
+            tensor.shape[0],
+            tensor.shape[1],
+            tensor.shape[2],
+            tensor.shape[3],
+        );
+
+        assert_eq!(channels, self.mean.len());
+
+        let mut normalized = tensor.data.clone();
+        for b in 0..batch_size {
+            for c in 0..channels {
+                for h in 0..height {
+                    for w in 0..width {
+                        let idx = ((b * channels + c) * height + h) * width + w;
+                        normalized[idx] = (normalized[idx] - self.mean[c]) / self.std[c];
+                    }
+                }
+            }
+        }
+
+        Ok(Tensor::new(
+            normalized,
+            tensor.shape.clone(),
+            tensor.requires_grad,
+            tensor.device.clone(),
+            tensor.dtype,
+        ))
+    }
+}
diff --git a/src/data/sampler.rs b/src/data/sampler.rs
new file mode 100644
index 0000000..8a3fef1
--- /dev/null
+++ b/src/data/sampler.rs
@@ -0,0 +1,91 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
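+
+// Usage sketch for the samplers defined below (hypothetical 1000-item dataset):
+//
+//     let mut sampler = RandomSampler::new(1000);
+//     let batch = sampler.sample(32); // 32 distinct indices from a shuffled permutation
+//
+//     let mut seq = SequentialSampler::new(1000);
+//     let first = seq.sample(32);     // indices 0..32 in order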
+
+use rand::seq::SliceRandom;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+pub trait Sampler: Send + Sync {
+    fn sample(&mut self, n: usize) -> Vec<usize>;
+    fn len(&self) -> usize;
+}
+
+pub struct RandomSampler {
+    data_len: usize,
+    current_index: AtomicUsize,
+    indices: Vec<usize>,
+}
+
+impl RandomSampler {
+    pub fn new(data_len: usize) -> Self {
+        let mut indices: Vec<usize> = (0..data_len).collect();
+        let mut rng = rand::thread_rng();
+        indices.shuffle(&mut rng);
+        RandomSampler {
+            data_len,
+            current_index: AtomicUsize::new(0),
+            indices,
+        }
+    }
+}
+
+impl Sampler for RandomSampler {
+    fn sample(&mut self, n: usize) -> Vec<usize> {
+        let current = self.current_index.fetch_add(n, Ordering::SeqCst);
+        if current >= self.data_len {
+            // Epoch exhausted: reshuffle and start over.
+            let mut indices: Vec<usize> = (0..self.data_len).collect();
+            let mut rng = rand::thread_rng();
+            indices.shuffle(&mut rng);
+            self.indices = indices;
+            self.current_index.store(n, Ordering::SeqCst);
+            self.indices[0..n.min(self.data_len)].to_vec()
+        } else {
+            self.indices[current..current + n.min(self.data_len - current)].to_vec()
+        }
+    }
+
+    fn len(&self) -> usize {
+        self.data_len
+    }
+}
+
+pub struct SequentialSampler {
+    data_len: usize,
+    current_index: AtomicUsize,
+}
+
+impl SequentialSampler {
+    pub fn new(data_len: usize) -> Self {
+        SequentialSampler {
+            data_len,
+            current_index: AtomicUsize::new(0),
+        }
+    }
+}
+
+impl Sampler for SequentialSampler {
+    fn sample(&mut self, n: usize) -> Vec<usize> {
+        let current = self.current_index.fetch_add(n, Ordering::SeqCst);
+        if current >= self.data_len {
+            // Wrap around to the start of the dataset.
+            self.current_index.store(n, Ordering::SeqCst);
+            (0..n.min(self.data_len)).collect()
+        } else {
+            (current..current + n.min(self.data_len - current)).collect()
+        }
+    }
+
+    fn len(&self) -> usize {
+        self.data_len
+    }
+}
diff --git a/src/distributed/distributed.rs b/src/distributed/distributed.rs
new file mode 100644
index 0000000..5f91557
--- /dev/null
+++ b/src/distributed/distributed.rs
@@ -0,0 +1,83 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
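+
+// Usage sketch, assuming a tokio runtime and the `Model`/`Optimizer`/`Loss`
+// trait objects referenced below (their definitions live elsewhere in the
+// crate):
+//
+//     let trainer = DistributedTrainer::new(model, optimizer, loss_fn, 4, 0);
+//     let loss = trainer.train_step((batch_x, batch_y)).await;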
+
+use crate::core::tensor::Tensor;
+use std::sync::Arc;
+use tokio::sync::Mutex;
+
+// NOTE: `Model`, `Optimizer`, and `Loss` are assumed to be object-safe traits
+// defined elsewhere in the crate; the type parameters below are reconstructed
+// on that assumption.
+pub struct DistributedTrainer {
+    world_size: usize,
+    rank: usize,
+    model: Arc<Mutex<Box<dyn Model>>>,
+    optimizer: Arc<Mutex<Box<dyn Optimizer>>>,
+    loss_fn: Box<dyn Loss>,
+}
+
+impl DistributedTrainer {
+    pub fn new(
+        model: Box<dyn Model>,
+        optimizer: Box<dyn Optimizer>,
+        loss_fn: Box<dyn Loss>,
+        world_size: usize,
+        rank: usize,
+    ) -> Self {
+        DistributedTrainer {
+            world_size,
+            rank,
+            model: Arc::new(Mutex::new(model)),
+            optimizer: Arc::new(Mutex::new(optimizer)),
+            loss_fn,
+        }
+    }
+
+    pub async fn average_gradients(&self) {
+        let mut model = self.model.lock().await;
+        // `parameters_mut` is assumed here: averaging has to mutate the stored
+        // gradients in place, which a by-value `parameters()` cannot do.
+        for param in model.parameters_mut() {
+            if let Some(ref mut grad) = param.grad {
+                // Simulate gradient averaging across processes.
+                for g in grad.iter_mut() {
+                    *g /= self.world_size as f32;
+                }
+            }
+        }
+    }
+
+    pub async fn train_step(&self, batch: (Tensor, Tensor)) -> f32 {
+        let (batch_x, batch_y) = batch;
+        let loss;
+
+        {
+            let mut model = self.model.lock().await;
+            let mut optimizer = self.optimizer.lock().await;
+
+            optimizer.zero_grad();
+
+            // Forward pass
+            let prediction = model.forward(&batch_x);
+            loss = self.loss_fn.forward(&prediction, &batch_y);
+
+            // Backward pass
+            let grad = self.loss_fn.backward(&prediction, &batch_y);
+            model.backward(&grad);
+        }
+
+        // Average gradients across all processes
+        self.average_gradients().await;
+
+        // Update parameters
+        let mut optimizer = self.optimizer.lock().await;
+        optimizer.step();
+
+        loss
+    }
+}
diff --git a/src/distributed/mod.rs b/src/distributed/mod.rs
new file mode 100644
index 0000000..aa9edb5
--- /dev/null
+++ b/src/distributed/mod.rs
@@ -0,0 +1 @@
+pub mod distributed;
diff --git a/src/layer/activation.rs b/src/layer/activation.rs
new file mode 100644
index 0000000..05054bc
--- /dev/null
+++ b/src/layer/activation.rs
@@ -0,0 +1,118 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
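+
+// Usage sketch: `forward` caches state that `backward` relies on, so the two
+// must be called in order on the same instance:
+//
+//     let mut relu = ReLU::new();
+//     let y = relu.forward(&x)?;
+//     let dx = relu.backward(&grad_y)?;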
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+
+pub trait Activation {
+    // `forward` takes `&mut self` so activations can cache state
+    // (e.g. the ReLU mask) for the backward pass.
+    fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError>;
+    fn backward(&self, grad_output: &Tensor) -> Result<Tensor, BellandeError>;
+}
+
+pub struct ReLU {
+    mask: Option<Vec<bool>>,
+}
+
+impl ReLU {
+    pub fn new() -> Self {
+        ReLU { mask: None }
+    }
+}
+
+impl Activation for ReLU {
+    fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        let mut output = input.data.clone();
+        let mask: Vec<bool> = output
+            .iter_mut()
+            .map(|x| {
+                if *x < 0.0 {
+                    *x = 0.0;
+                    false
+                } else {
+                    true
+                }
+            })
+            .collect();
+
+        // Cache the mask so `backward` can route gradients.
+        self.mask = Some(mask);
+
+        Ok(Tensor::new(
+            output,
+            input.shape.clone(),
+            input.requires_grad,
+            input.device.clone(),
+            input.dtype,
+        ))
+    }
+
+    fn backward(&self, grad_output: &Tensor) -> Result<Tensor, BellandeError> {
+        if let Some(ref mask) = self.mask {
+            let grad = grad_output
+                .data
+                .iter()
+                .zip(mask.iter())
+                .map(|(&g, &m)| if m { g } else { 0.0 })
+                .collect();
+
+            Ok(Tensor::new(
+                grad,
+                grad_output.shape.clone(),
+                true,
+                grad_output.device.clone(),
+                grad_output.dtype,
+            ))
+        } else {
+            Err(BellandeError::RuntimeError(
+                "Forward pass not called".into(),
+            ))
+        }
+    }
+}
+
+pub struct Sigmoid {
+    // Cached forward output; the sigmoid derivative s * (1 - s) is a function
+    // of the output, not of the incoming gradient.
+    output: Option<Vec<f32>>,
+}
+
+impl Sigmoid {
+    pub fn new() -> Self {
+        Sigmoid { output: None }
+    }
+}
+
+impl Activation for Sigmoid {
+    fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        let output: Vec<f32> = input
+            .data
+            .iter()
+            .map(|&x| 1.0 / (1.0 + (-x).exp()))
+            .collect();
+
+        self.output = Some(output.clone());
+
+        Ok(Tensor::new(
+            output,
+            input.shape.clone(),
+            input.requires_grad,
+            input.device.clone(),
+            input.dtype,
+        ))
+    }
+
+    fn backward(&self, grad_output: &Tensor) -> Result<Tensor, BellandeError> {
+        let output = self.output.as_ref().ok_or_else(|| {
+            BellandeError::RuntimeError("Forward pass not called".into())
+        })?;
+
+        // Chain rule through the cached activations: dL/dx = dL/dy * s * (1 - s).
+        let grad = grad_output
+            .data
+            .iter()
+            .zip(output.iter())
+            .map(|(&g, &s)| g * s * (1.0 - s))
+            .collect();
+
+        Ok(Tensor::new(
+            grad,
+            grad_output.shape.clone(),
+            true,
+            grad_output.device.clone(),
+            grad_output.dtype,
+        ))
+    }
+}
diff --git a/src/layer/avgpool2d.rs b/src/layer/avgpool2d.rs
new file mode 100644
index 0000000..3f6730e
--- /dev/null
+++ b/src/layer/avgpool2d.rs
@@ -0,0 +1,205 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
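+
+// The output size below follows the usual pooling arithmetic,
+// out = (in + 2 * padding - kernel) / stride + 1, so a 2x2 kernel with the
+// default stride (equal to the kernel size) halves each spatial dimension:
+//
+//     let mut pool = AvgPool2d::new((2, 2), None, None);
+//     let y = pool.forward(&x)?; // [N, C, 16, 16] for x of shape [N, C, 32, 32]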
+ +use crate::core::{error::BellandeError, tensor::Tensor}; +use crate::models::sequential::NeuralLayer; + +pub struct AvgPool2d { + kernel_size: (usize, usize), + stride: (usize, usize), + padding: (usize, usize), + pub(crate) input: Option, + training: bool, +} + +impl AvgPool2d { + pub fn new( + kernel_size: (usize, usize), + stride: Option<(usize, usize)>, + padding: Option<(usize, usize)>, + ) -> Self { + let stride = stride.unwrap_or(kernel_size); + let padding = padding.unwrap_or((0, 0)); + + AvgPool2d { + kernel_size, + stride, + padding, + input: None, + training: true, + } + } + + // Internal forward implementation + fn forward_impl(&mut self, input: &Tensor) -> Result { + if input.shape.len() != 4 { + return Err(BellandeError::InvalidShape( + "Expected 4D tensor (batch_size, channels, height, width)".into(), + )); + } + + let (batch_size, channels, height, width) = ( + input.shape[0], + input.shape[1], + input.shape[2], + input.shape[3], + ); + + let output_height = (height + 2 * self.padding.0 - self.kernel_size.0) / self.stride.0 + 1; + let output_width = (width + 2 * self.padding.1 - self.kernel_size.1) / self.stride.1 + 1; + + let mut output = vec![0.0; batch_size * channels * output_height * output_width]; + + for b in 0..batch_size { + for c in 0..channels { + for oh in 0..output_height { + for ow in 0..output_width { + let mut sum = 0.0; + let mut count = 0.0; + + for kh in 0..self.kernel_size.0 { + for kw in 0..self.kernel_size.1 { + let h = oh as isize * self.stride.0 as isize + kh as isize + - self.padding.0 as isize; + let w = ow as isize * self.stride.1 as isize + kw as isize + - self.padding.1 as isize; + + if h >= 0 && h < height as isize && w >= 0 && w < width as isize { + let input_idx = ((b * channels + c) * height + h as usize) + * width + + w as usize; + sum += input.data[input_idx]; + count += 1.0; + } + } + } + + let output_idx = + ((b * channels + c) * output_height + oh) * output_width + ow; + output[output_idx] = if count > 0.0 { sum / count } else { 0.0 }; + } + } + } + } + + Ok(Tensor::new( + output, + vec![batch_size, channels, output_height, output_width], + input.requires_grad, + input.device.clone(), + input.dtype, + )) + } + + pub fn backward_input( + &self, + grad_output: &Tensor, + input: &Tensor, + ) -> Result { + let (batch_size, channels, height, width) = ( + input.shape[0], + input.shape[1], + input.shape[2], + input.shape[3], + ); + + let mut grad_input = vec![0.0; input.data.len()]; + let kernel_size = (self.kernel_size.0 * self.kernel_size.1) as f32; + + for b in 0..batch_size { + for c in 0..channels { + for h in 0..height { + for w in 0..width { + let mut grad = 0.0; + + let oh_start = (h.saturating_sub(self.kernel_size.0 - 1) + self.stride.0 + - 1) + / self.stride.0; + let ow_start = (w.saturating_sub(self.kernel_size.1 - 1) + self.stride.1 + - 1) + / self.stride.1; + + let oh_end = (h + self.padding.0) / self.stride.0; + let ow_end = (w + self.padding.1) / self.stride.1; + + for oh in oh_start..=oh_end { + for ow in ow_start..=ow_end { + if oh < grad_output.shape[2] && ow < grad_output.shape[3] { + let output_idx = ((b * channels + c) * grad_output.shape[2] + + oh) + * grad_output.shape[3] + + ow; + grad += grad_output.data[output_idx] / kernel_size; + } + } + } + + let input_idx = ((b * channels + c) * height + h) * width + w; + grad_input[input_idx] = grad; + } + } + } + } + + Ok(Tensor::new( + grad_input, + input.shape.clone(), + true, + input.device.clone(), + input.dtype, + )) + } +} + +impl NeuralLayer for AvgPool2d { + fn 
forward(&mut self, input: &Tensor) -> Result { + let output = self.forward_impl(input)?; + self.input = Some(input.clone()); + Ok(output) + } + + fn backward(&mut self, grad_output: &Tensor) -> Result { + let input = self.input.as_ref().ok_or(BellandeError::InvalidBackward( + "Forward pass not called before backward".into(), + ))?; + self.backward_input(grad_output, input) + } + + fn parameters(&self) -> Vec { + Vec::new() // AvgPool2d has no learnable parameters + } + + fn named_parameters(&self) -> Vec<(String, Tensor)> { + Vec::new() // AvgPool2d has no learnable parameters + } + + fn set_parameter(&mut self, _name: &str, _value: Tensor) -> Result<(), BellandeError> { + Err(BellandeError::InvalidParameter( + "AvgPool2d has no learnable parameters".to_string(), + )) + } + + fn train(&mut self) { + self.training = true; + } + + fn eval(&mut self) { + self.training = false; + } +} + +// Implement Send and Sync for thread safety +unsafe impl Send for AvgPool2d {} +unsafe impl Sync for AvgPool2d {} diff --git a/src/layer/batch_norm.rs b/src/layer/batch_norm.rs new file mode 100644 index 0000000..64ff722 --- /dev/null +++ b/src/layer/batch_norm.rs @@ -0,0 +1,739 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
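+
+// Usage sketch: running statistics are updated only in training mode and are
+// used for normalization only in eval mode:
+//
+//     let mut bn = BatchNorm2d::new(64, 1e-5, 0.9, true);
+//     bn.train();
+//     let y_train = bn.forward(&x)?; // batch statistics; running stats updated
+//     bn.eval();
+//     let y_eval = bn.forward(&x)?;  // frozen running statistics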
+ +use crate::core::{error::BellandeError, tensor::Tensor}; +use crate::models::sequential::NeuralLayer; +use std::sync::Arc; + +pub struct BatchNorm1d { + num_features: usize, + eps: f32, + momentum: f32, + pub(crate) running_mean: Arc, + pub(crate) running_var: Arc, + pub(crate) weight: Option, + pub(crate) bias: Option, + pub(crate) training: bool, + input: Option, +} + +pub struct BatchNorm2d { + num_features: usize, + eps: f32, + momentum: f32, + pub(crate) running_mean: Arc, + pub(crate) running_var: Arc, + pub(crate) weight: Option, + pub(crate) bias: Option, + pub(crate) training: bool, + input: Option, +} + +impl BatchNorm1d { + pub fn new(num_features: usize, eps: f32, momentum: f32, affine: bool) -> Self { + let running_mean = Arc::new(Tensor::zeros(&[num_features])); + let running_var = Arc::new(Tensor::ones(&[num_features])); + + BatchNorm1d { + num_features, + eps, + momentum, + running_mean, + running_var, + weight: if affine { + Some(Tensor::ones(&[num_features])) + } else { + None + }, + bias: if affine { + Some(Tensor::zeros(&[num_features])) + } else { + None + }, + training: true, + input: None, + } + } + + fn update_running_stats(&mut self, mean: &[f32], var: &[f32]) -> Result<(), BellandeError> { + let running_mean = Arc::get_mut(&mut self.running_mean).ok_or_else(|| { + BellandeError::RuntimeError("Failed to get mutable reference to running mean".into()) + })?; + + let running_var = Arc::get_mut(&mut self.running_var).ok_or_else(|| { + BellandeError::RuntimeError( + "Failed to get mutable reference to running variance".into(), + ) + })?; + + for i in 0..self.num_features { + running_mean.data[i] = + self.momentum * running_mean.data[i] + (1.0 - self.momentum) * mean[i]; + running_var.data[i] = + self.momentum * running_var.data[i] + (1.0 - self.momentum) * var[i]; + } + + Ok(()) + } + + fn forward_impl(&mut self, input: &Tensor) -> Result { + if input.shape.len() != 2 { + return Err(BellandeError::InvalidShape( + "Expected 2D tensor (batch_size, num_features)".into(), + )); + } + + let (batch_size, features) = (input.shape[0], input.shape[1]); + + if features != self.num_features { + return Err(BellandeError::InvalidOperation(format!( + "Expected {} features but got {}", + self.num_features, features + ))); + } + + let mut output = input.data.clone(); + + if self.training { + let mut mean = vec![0.0; features]; + let mut var = vec![0.0; features]; + + // Calculate mean and variance + for f in 0..features { + let mut sum = 0.0; + let mut sq_sum = 0.0; + + for b in 0..batch_size { + let idx = b * features + f; + let val = input.data[idx]; + sum += val; + sq_sum += val * val; + } + + mean[f] = sum / batch_size as f32; + var[f] = sq_sum / batch_size as f32 - mean[f] * mean[f]; + } + + // Update running statistics + self.update_running_stats(&mean, &var)?; + + // Normalize + for f in 0..features { + let std = (var[f] + self.eps).sqrt(); + for b in 0..batch_size { + let idx = b * features + f; + output[idx] = (output[idx] - mean[f]) / std; + + if let Some(ref weight) = self.weight { + output[idx] *= weight.data[f]; + } + if let Some(ref bias) = self.bias { + output[idx] += bias.data[f]; + } + } + } + } else { + // Use running statistics for inference + let running_mean = &self.running_mean; + let running_var = &self.running_var; + + for f in 0..features { + let std = (running_var.data[f] + self.eps).sqrt(); + for b in 0..batch_size { + let idx = b * features + f; + output[idx] = (output[idx] - running_mean.data[f]) / std; + + if let Some(ref weight) = self.weight { + 
output[idx] *= weight.data[f]; + } + if let Some(ref bias) = self.bias { + output[idx] += bias.data[f]; + } + } + } + } + + Ok(Tensor::new( + output, + input.shape.clone(), + input.requires_grad, + input.device.clone(), + input.dtype, + )) + } + + fn backward_impl(&mut self, grad_output: &Tensor) -> Result { + let input = self.input.as_ref().ok_or(BellandeError::InvalidBackward( + "Forward pass not called before backward".into(), + ))?; + + let (batch_size, num_features) = (input.shape[0], input.shape[1]); + let n = batch_size as f32; + + // Calculate mean and variance + let mut mean = vec![0.0; num_features]; + let mut var = vec![0.0; num_features]; + + for f in 0..num_features { + let mut sum = 0.0; + let mut sq_sum = 0.0; + for b in 0..batch_size { + let idx = b * num_features + f; + let val = input.data[idx]; + sum += val; + sq_sum += val * val; + } + mean[f] = sum / n; + var[f] = sq_sum / n - mean[f] * mean[f]; + } + + // Initialize gradients + let mut dx = vec![0.0; input.data.len()]; + let mut dweight = if self.weight.is_some() { + vec![0.0; num_features] + } else { + vec![] + }; + let mut dbias = if self.bias.is_some() { + vec![0.0; num_features] + } else { + vec![] + }; + + // Compute gradients + for f in 0..num_features { + let std = (var[f] + self.eps).sqrt(); + let inv_std = 1.0 / std; + + let mut dxhat = vec![0.0; batch_size]; + let mut sum_dxhat = 0.0; + let mut sum_dxhat_x = 0.0; + + // Compute dxhat and accumulate sums + for b in 0..batch_size { + let idx = b * num_features + f; + let xhat = (input.data[idx] - mean[f]) * inv_std; + + dxhat[b] = grad_output.data[idx]; + if let Some(ref weight) = self.weight { + dxhat[b] *= weight.data[f]; + } + + sum_dxhat += dxhat[b]; + sum_dxhat_x += dxhat[b] * xhat; + } + + // Compute dx + for b in 0..batch_size { + let idx = b * num_features + f; + let xhat = (input.data[idx] - mean[f]) * inv_std; + + dx[idx] = inv_std * (dxhat[b] - sum_dxhat / n - xhat * sum_dxhat_x / n); + } + + // Compute dweight and dbias if they exist + if let Some(_) = self.weight { + dweight[f] = 0.0; + for b in 0..batch_size { + let idx = b * num_features + f; + let xhat = (input.data[idx] - mean[f]) * inv_std; + dweight[f] += grad_output.data[idx] * xhat; + } + } + + if let Some(_) = self.bias { + dbias[f] = 0.0; + for b in 0..batch_size { + let idx = b * num_features + f; + dbias[f] += grad_output.data[idx]; + } + } + } + + // Update weight and bias gradients if they exist + if let Some(ref mut weight) = self.weight { + weight.grad = Some(dweight); + } + + if let Some(ref mut bias) = self.bias { + bias.grad = Some(dbias); + } + + Ok(Tensor::new( + dx, + input.shape.clone(), + true, + input.device.clone(), + input.dtype, + )) + } +} + +impl BatchNorm2d { + pub fn new(num_features: usize, eps: f32, momentum: f32, affine: bool) -> Self { + let running_mean = Arc::new(Tensor::zeros(&[num_features])); + let running_var = Arc::new(Tensor::ones(&[num_features])); + + BatchNorm2d { + num_features, + eps, + momentum, + running_mean, + running_var, + weight: if affine { + Some(Tensor::ones(&[num_features])) + } else { + None + }, + bias: if affine { + Some(Tensor::zeros(&[num_features])) + } else { + None + }, + training: true, + input: None, + } + } + + fn update_running_stats(&mut self, mean: &[f32], var: &[f32]) -> Result<(), BellandeError> { + let running_mean = Arc::get_mut(&mut self.running_mean).ok_or_else(|| { + BellandeError::RuntimeError("Failed to get mutable reference to running mean".into()) + })?; + + let running_var = Arc::get_mut(&mut 
self.running_var).ok_or_else(|| { + BellandeError::RuntimeError( + "Failed to get mutable reference to running variance".into(), + ) + })?; + + for c in 0..self.num_features { + running_mean.data[c] = + self.momentum * running_mean.data[c] + (1.0 - self.momentum) * mean[c]; + running_var.data[c] = + self.momentum * running_var.data[c] + (1.0 - self.momentum) * var[c]; + } + + Ok(()) + } + + fn forward_impl(&mut self, input: &Tensor) -> Result { + if input.shape.len() != 4 { + return Err(BellandeError::InvalidShape( + "Expected 4D tensor (batch_size, channels, height, width)".into(), + )); + } + + let (batch_size, channels, height, width) = ( + input.shape[0], + input.shape[1], + input.shape[2], + input.shape[3], + ); + + if channels != self.num_features { + return Err(BellandeError::InvalidOperation(format!( + "Expected {} channels but got {}", + self.num_features, channels + ))); + } + + let mut output = input.data.clone(); + + if self.training { + let mut mean = vec![0.0; channels]; + let mut var = vec![0.0; channels]; + let size = batch_size * height * width; + let n = size as f32; + + for c in 0..channels { + let mut sum = 0.0; + let mut sq_sum = 0.0; + + for b in 0..batch_size { + for h in 0..height { + for w in 0..width { + let idx = ((b * channels + c) * height + h) * width + w; + let val = input.data[idx]; + sum += val; + sq_sum += val * val; + } + } + } + + mean[c] = sum / n; + var[c] = sq_sum / n - mean[c] * mean[c]; + } + + // Update running statistics + self.update_running_stats(&mean, &var)?; + + // Normalize + for c in 0..channels { + let std = (var[c] + self.eps).sqrt(); + for b in 0..batch_size { + for h in 0..height { + for w in 0..width { + let idx = ((b * channels + c) * height + h) * width + w; + output[idx] = (output[idx] - mean[c]) / std; + + if let Some(ref weight) = self.weight { + output[idx] *= weight.data[c]; + } + if let Some(ref bias) = self.bias { + output[idx] += bias.data[c]; + } + } + } + } + } + } else { + // Use running statistics + let running_mean = &self.running_mean; + let running_var = &self.running_var; + + for c in 0..channels { + let std = (running_var.data[c] + self.eps).sqrt(); + for b in 0..batch_size { + for h in 0..height { + for w in 0..width { + let idx = ((b * channels + c) * height + h) * width + w; + output[idx] = (output[idx] - running_mean.data[c]) / std; + + if let Some(ref weight) = self.weight { + output[idx] *= weight.data[c]; + } + if let Some(ref bias) = self.bias { + output[idx] += bias.data[c]; + } + } + } + } + } + } + + Ok(Tensor::new( + output, + input.shape.clone(), + input.requires_grad, + input.device.clone(), + input.dtype, + )) + } + fn backward_impl(&mut self, grad_output: &Tensor) -> Result { + let input = self.input.as_ref().ok_or(BellandeError::InvalidBackward( + "Forward pass not called before backward".into(), + ))?; + + let (batch_size, channels, height, width) = ( + input.shape[0], + input.shape[1], + input.shape[2], + input.shape[3], + ); + let spatial_size = height * width; + let n = (batch_size * spatial_size) as f32; + + // Calculate mean and variance + let mut mean = vec![0.0; channels]; + let mut var = vec![0.0; channels]; + + for c in 0..channels { + let mut sum = 0.0; + let mut sq_sum = 0.0; + for b in 0..batch_size { + for h in 0..height { + for w in 0..width { + let idx = ((b * channels + c) * height + h) * width + w; + let val = input.data[idx]; + sum += val; + sq_sum += val * val; + } + } + } + mean[c] = sum / n; + var[c] = sq_sum / n - mean[c] * mean[c]; + } + + // Initialize gradients + let mut dx 
= vec![0.0; input.data.len()]; + let mut dweight = if self.weight.is_some() { + vec![0.0; channels] + } else { + vec![] + }; + let mut dbias = if self.bias.is_some() { + vec![0.0; channels] + } else { + vec![] + }; + + // Compute gradients for each channel + for c in 0..channels { + let std = (var[c] + self.eps).sqrt(); + let inv_std = 1.0 / std; + + let mut sum_dxhat = 0.0; + let mut sum_dxhat_x = 0.0; + + // First pass: compute sums for the channel + for b in 0..batch_size { + for h in 0..height { + for w in 0..width { + let idx = ((b * channels + c) * height + h) * width + w; + let xhat = (input.data[idx] - mean[c]) * inv_std; + + let dxhat = grad_output.data[idx] + * if let Some(ref weight) = self.weight { + weight.data[c] + } else { + 1.0 + }; + + sum_dxhat += dxhat; + sum_dxhat_x += dxhat * xhat; + } + } + } + + // Second pass: compute dx for the channel + for b in 0..batch_size { + for h in 0..height { + for w in 0..width { + let idx = ((b * channels + c) * height + h) * width + w; + let xhat = (input.data[idx] - mean[c]) * inv_std; + + let dxhat = grad_output.data[idx] + * if let Some(ref weight) = self.weight { + weight.data[c] + } else { + 1.0 + }; + + dx[idx] = inv_std * (dxhat - sum_dxhat / n - xhat * sum_dxhat_x / n); + } + } + } + + // Compute dweight and dbias if they exist + if let Some(_) = self.weight { + dweight[c] = 0.0; + for b in 0..batch_size { + for h in 0..height { + for w in 0..width { + let idx = ((b * channels + c) * height + h) * width + w; + let xhat = (input.data[idx] - mean[c]) * inv_std; + dweight[c] += grad_output.data[idx] * xhat; + } + } + } + } + + if let Some(_) = self.bias { + dbias[c] = 0.0; + for b in 0..batch_size { + for h in 0..height { + for w in 0..width { + let idx = ((b * channels + c) * height + h) * width + w; + dbias[c] += grad_output.data[idx]; + } + } + } + } + } + + // Update weight and bias gradients if they exist + if let Some(ref mut weight) = self.weight { + weight.grad = Some(dweight); + } + + if let Some(ref mut bias) = self.bias { + bias.grad = Some(dbias); + } + + Ok(Tensor::new( + dx, + input.shape.clone(), + true, + input.device.clone(), + input.dtype, + )) + } +} + +// Implement NeuralLayer for BatchNorm1d +impl NeuralLayer for BatchNorm1d { + fn forward(&mut self, input: &Tensor) -> Result { + let output = self.forward_impl(input)?; + self.input = Some(input.clone()); + Ok(output) + } + + fn backward(&mut self, grad_output: &Tensor) -> Result { + self.backward_impl(grad_output) + } + + fn parameters(&self) -> Vec { + let mut params = Vec::new(); + if let Some(ref weight) = self.weight { + params.push(weight.clone()); + } + if let Some(ref bias) = self.bias { + params.push(bias.clone()); + } + params + } + + fn named_parameters(&self) -> Vec<(String, Tensor)> { + let mut params = Vec::new(); + if let Some(ref weight) = self.weight { + params.push(("weight".to_string(), weight.clone())); + } + if let Some(ref bias) = self.bias { + params.push(("bias".to_string(), bias.clone())); + } + params + } + + fn set_parameter(&mut self, name: &str, value: Tensor) -> Result<(), BellandeError> { + match name { + "weight" => { + if let Some(ref weight) = self.weight { + if value.shape == weight.shape { + self.weight = Some(value); + Ok(()) + } else { + Err(BellandeError::ShapeMismatch("Weight shape mismatch".into())) + } + } else { + Err(BellandeError::InvalidParameter( + "Layer does not use weights".into(), + )) + } + } + "bias" => { + if let Some(ref bias) = self.bias { + if value.shape == bias.shape { + self.bias = Some(value); + Ok(()) 
+ } else { + Err(BellandeError::ShapeMismatch("Bias shape mismatch".into())) + } + } else { + Err(BellandeError::InvalidParameter( + "Layer does not use bias".into(), + )) + } + } + _ => Err(BellandeError::InvalidParameter(format!( + "Unknown parameter name: {}", + name + ))), + } + } + + fn train(&mut self) { + self.training = true; + } + + fn eval(&mut self) { + self.training = false; + } +} + +// Implement NeuralLayer for BatchNorm2d +impl NeuralLayer for BatchNorm2d { + fn forward(&mut self, input: &Tensor) -> Result { + let output = self.forward_impl(input)?; + self.input = Some(input.clone()); + Ok(output) + } + + fn backward(&mut self, grad_output: &Tensor) -> Result { + self.backward_impl(grad_output) + } + + fn parameters(&self) -> Vec { + let mut params = Vec::new(); + if let Some(ref weight) = self.weight { + params.push(weight.clone()); + } + if let Some(ref bias) = self.bias { + params.push(bias.clone()); + } + params + } + + fn named_parameters(&self) -> Vec<(String, Tensor)> { + let mut params = Vec::new(); + if let Some(ref weight) = self.weight { + params.push(("weight".to_string(), weight.clone())); + } + if let Some(ref bias) = self.bias { + params.push(("bias".to_string(), bias.clone())); + } + params + } + + fn set_parameter(&mut self, name: &str, value: Tensor) -> Result<(), BellandeError> { + match name { + "weight" => { + if let Some(ref weight) = self.weight { + if value.shape == weight.shape { + self.weight = Some(value); + Ok(()) + } else { + Err(BellandeError::ShapeMismatch("Weight shape mismatch".into())) + } + } else { + Err(BellandeError::InvalidParameter( + "Layer does not use weights".into(), + )) + } + } + "bias" => { + if let Some(ref bias) = self.bias { + if value.shape == bias.shape { + self.bias = Some(value); + Ok(()) + } else { + Err(BellandeError::ShapeMismatch("Bias shape mismatch".into())) + } + } else { + Err(BellandeError::InvalidParameter( + "Layer does not use bias".into(), + )) + } + } + _ => Err(BellandeError::InvalidParameter(format!( + "Unknown parameter name: {}", + name + ))), + } + } + + fn train(&mut self) { + self.training = true; + } + + fn eval(&mut self) { + self.training = false; + } +} + +// Implement Send and Sync for thread safety +unsafe impl Send for BatchNorm1d {} +unsafe impl Sync for BatchNorm1d {} +unsafe impl Send for BatchNorm2d {} +unsafe impl Sync for BatchNorm2d {} diff --git a/src/layer/conv.rs b/src/layer/conv.rs new file mode 100644 index 0000000..4807143 --- /dev/null +++ b/src/layer/conv.rs @@ -0,0 +1,427 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
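+
+// Usage sketch: note that a `None` stride defaults to the kernel size
+// (pooling-style), so pass an explicit (1, 1) for an ordinary convolution:
+//
+//     let mut conv = Conv2d::new(3, 16, (3, 3), Some((1, 1)), Some((1, 1)), true);
+//     let y = conv.forward(&x)?; // 3x3 kernel, stride 1, padding 1 preserves HxW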
+ +use crate::core::{error::BellandeError, tensor::Tensor}; +use crate::models::sequential::NeuralLayer; + +pub struct Conv2d { + in_channels: usize, + out_channels: usize, + kernel_size: (usize, usize), + stride: (usize, usize), + padding: (usize, usize), + pub(crate) weight: Tensor, + pub(crate) bias: Option, + pub(crate) input: Option, // Changed from input_cache to input + pub(crate) weight_grad: Option, + pub(crate) bias_grad: Option, + pub(crate) training: bool, +} + +impl Conv2d { + pub fn new( + in_channels: usize, + out_channels: usize, + kernel_size: (usize, usize), + stride: Option<(usize, usize)>, + padding: Option<(usize, usize)>, + bias: bool, + ) -> Self { + let stride = stride.unwrap_or(kernel_size); + let padding = padding.unwrap_or((0, 0)); + let weight = Tensor::randn(&[out_channels, in_channels, kernel_size.0, kernel_size.1]); + let bias = if bias { + Some(Tensor::zeros(&[out_channels])) + } else { + None + }; + + Conv2d { + in_channels, + out_channels, + kernel_size, + stride, + padding, + weight, + bias, + input: None, + weight_grad: None, + bias_grad: None, + training: true, + } + } + + fn forward_impl(&mut self, input: &Tensor) -> Result { + if input.shape.len() != 4 { + return Err(BellandeError::InvalidShape( + "Expected 4D tensor (batch_size, channels, height, width)".into(), + )); + } + + let (batch_size, channels, height, width) = ( + input.shape[0], + input.shape[1], + input.shape[2], + input.shape[3], + ); + + if channels != self.in_channels { + return Err(BellandeError::DimensionMismatch); + } + + // Safe output dimension calculation + let output_height = ((height as i64 + 2 * self.padding.0 as i64 + - self.kernel_size.0 as i64) + / self.stride.0 as i64 + + 1) as usize; + let output_width = ((width as i64 + 2 * self.padding.1 as i64 - self.kernel_size.1 as i64) + / self.stride.1 as i64 + + 1) as usize; + + // Validate output dimensions + if output_height == 0 || output_width == 0 { + return Err(BellandeError::InvalidShape( + "Convolution resulted in zero output dimensions".into(), + )); + } + + let mut output = vec![0.0; batch_size * self.out_channels * output_height * output_width]; + + // Implement convolution operation with bounds checking + for b in 0..batch_size { + for out_c in 0..self.out_channels { + for out_h in 0..output_height { + for out_w in 0..output_width { + let mut sum = 0.0; + + for in_c in 0..self.in_channels { + for k_h in 0..self.kernel_size.0 { + for k_w in 0..self.kernel_size.1 { + // Safe input position calculation with padding + let in_h = out_h + .checked_mul(self.stride.0) + .and_then(|h| h.checked_add(k_h)) + .and_then(|h| h.checked_sub(self.padding.0)); + + let in_w = out_w + .checked_mul(self.stride.1) + .and_then(|w| w.checked_add(k_w)) + .and_then(|w| w.checked_sub(self.padding.1)); + + // Check if the input position is valid + if let (Some(h), Some(w)) = (in_h, in_w) { + if h < height && w < width { + let input_idx = + ((b * channels + in_c) * height + h) * width + w; + let weight_idx = ((out_c * self.in_channels + in_c) + * self.kernel_size.0 + + k_h) + * self.kernel_size.1 + + k_w; + + if input_idx < input.data.len() + && weight_idx < self.weight.data.len() + { + sum += input.data[input_idx] + * self.weight.data[weight_idx]; + } + } + } + } + } + } + + if let Some(ref bias) = self.bias { + if out_c < bias.data.len() { + sum += bias.data[out_c]; + } + } + + let output_idx = ((b * self.out_channels + out_c) * output_height + out_h) + * output_width + + out_w; + if output_idx < output.len() { + output[output_idx] = sum; + } + } 
+ } + } + } + + Ok(Tensor::new( + output, + vec![batch_size, self.out_channels, output_height, output_width], + input.requires_grad, + input.device.clone(), + input.dtype, + )) + } + + pub fn backward_input( + &self, + grad_output: &Tensor, + input: &Tensor, + ) -> Result { + let (batch_size, _, height, width) = ( + input.shape[0], + input.shape[1], + input.shape[2], + input.shape[3], + ); + + let mut grad_input = vec![0.0; input.data.len()]; + let (_, _, output_height, output_width) = ( + grad_output.shape[0], + grad_output.shape[1], + grad_output.shape[2], + grad_output.shape[3], + ); + + // Compute input gradients + for b in 0..batch_size { + for out_c in 0..self.out_channels { + for out_h in 0..output_height { + for out_w in 0..output_width { + let out_idx = ((b * self.out_channels + out_c) * output_height + out_h) + * output_width + + out_w; + let grad = grad_output.data[out_idx]; + + for in_c in 0..self.in_channels { + for k_h in 0..self.kernel_size.0 { + for k_w in 0..self.kernel_size.1 { + let in_h = out_h * self.stride.0 + k_h - self.padding.0; + let in_w = out_w * self.stride.1 + k_w - self.padding.1; + + if in_h < height && in_w < width { + let input_idx = + ((b * self.in_channels + in_c) * height + in_h) * width + + in_w; + let weight_idx = ((out_c * self.in_channels + in_c) + * self.kernel_size.0 + + k_h) + * self.kernel_size.1 + + k_w; + grad_input[input_idx] += + grad * self.weight.data[weight_idx]; + } + } + } + } + } + } + } + } + + Ok(Tensor::new( + grad_input, + input.shape.clone(), + true, + input.device.clone(), + input.dtype, + )) + } + + pub fn backward_weight( + &self, + grad_output: &Tensor, + input: &Tensor, + ) -> Result { + let mut grad_weight = vec![0.0; self.weight.data.len()]; + let (batch_size, _, output_height, output_width) = ( + grad_output.shape[0], + grad_output.shape[1], + grad_output.shape[2], + grad_output.shape[3], + ); + + let (_, _, height, width) = ( + input.shape[0], + input.shape[1], + input.shape[2], + input.shape[3], + ); + + // Compute weight gradients + for b in 0..batch_size { + for out_c in 0..self.out_channels { + for out_h in 0..output_height { + for out_w in 0..output_width { + let out_idx = ((b * self.out_channels + out_c) * output_height + out_h) + * output_width + + out_w; + let grad = grad_output.data[out_idx]; + + for in_c in 0..self.in_channels { + for k_h in 0..self.kernel_size.0 { + for k_w in 0..self.kernel_size.1 { + let in_h = out_h * self.stride.0 + k_h - self.padding.0; + let in_w = out_w * self.stride.1 + k_w - self.padding.1; + + if in_h < height && in_w < width { + let input_idx = + ((b * self.in_channels + in_c) * height + in_h) * width + + in_w; + let weight_idx = ((out_c * self.in_channels + in_c) + * self.kernel_size.0 + + k_h) + * self.kernel_size.1 + + k_w; + grad_weight[weight_idx] += grad * input.data[input_idx]; + } + } + } + } + } + } + } + } + + Ok(Tensor::new( + grad_weight, + self.weight.shape.clone(), + true, + self.weight.device.clone(), + self.weight.dtype, + )) + } + + fn backward_bias(&self, grad_output: &Tensor) -> Result { + if self.bias.is_none() { + return Err(BellandeError::InvalidParameter("No bias present".into())); + } + + let mut grad_bias = vec![0.0; self.out_channels]; + let (batch_size, _, output_height, output_width) = ( + grad_output.shape[0], + grad_output.shape[1], + grad_output.shape[2], + grad_output.shape[3], + ); + + // Compute bias gradients + for b in 0..batch_size { + for out_c in 0..self.out_channels { + for out_h in 0..output_height { + for out_w in 0..output_width { + let 
out_idx = ((b * self.out_channels + out_c) * output_height + out_h) + * output_width + + out_w; + grad_bias[out_c] += grad_output.data[out_idx]; + } + } + } + } + + Ok(Tensor::new( + grad_bias, + vec![self.out_channels], + true, + self.weight.device.clone(), + self.weight.dtype, + )) + } +} + +impl NeuralLayer for Conv2d { + fn forward(&mut self, input: &Tensor) -> Result { + let output = self.forward_impl(input)?; + self.input = Some(input.clone()); + Ok(output) + } + + fn backward(&mut self, grad_output: &Tensor) -> Result { + let input = self.input.as_ref().ok_or(BellandeError::InvalidBackward( + "Forward pass not called before backward".into(), + ))?; + + let grad_input = self.backward_input(grad_output, input)?; + let grad_weight = self.backward_weight(grad_output, input)?; + let grad_bias = if self.bias.is_some() { + Some(self.backward_bias(grad_output)?) + } else { + None + }; + + // Store gradients + self.weight_grad = Some(grad_weight); + self.bias_grad = grad_bias; + + Ok(grad_input) + } + + fn parameters(&self) -> Vec { + let mut params = vec![self.weight.clone()]; + if let Some(ref bias) = self.bias { + params.push(bias.clone()); + } + params + } + + fn named_parameters(&self) -> Vec<(String, Tensor)> { + let mut params = vec![("weight".to_string(), self.weight.clone())]; + if let Some(ref bias) = self.bias { + params.push(("bias".to_string(), bias.clone())); + } + params + } + + fn set_parameter(&mut self, name: &str, value: Tensor) -> Result<(), BellandeError> { + match name { + "weight" => { + if value.shape == self.weight.shape { + self.weight = value; + Ok(()) + } else { + Err(BellandeError::ShapeMismatch( + "Weight shape mismatch".to_string(), + )) + } + } + "bias" => { + if let Some(ref bias) = self.bias { + if value.shape == bias.shape { + self.bias = Some(value); + Ok(()) + } else { + Err(BellandeError::ShapeMismatch( + "Bias shape mismatch".to_string(), + )) + } + } else { + Err(BellandeError::InvalidParameter( + "Layer does not use bias".to_string(), + )) + } + } + _ => Err(BellandeError::InvalidParameter(format!( + "Unknown parameter name: {}", + name + ))), + } + } + + fn train(&mut self) { + self.training = true; + } + + fn eval(&mut self) { + self.training = false; + } +} + +// Implement Send and Sync for thread safety +unsafe impl Send for Conv2d {} +unsafe impl Sync for Conv2d {} diff --git a/src/layer/dropout.rs b/src/layer/dropout.rs new file mode 100644 index 0000000..91cf1b3 --- /dev/null +++ b/src/layer/dropout.rs @@ -0,0 +1,135 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
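+
+// Usage sketch: this is inverted dropout, so kept activations are scaled by
+// 1 / (1 - p) during training and eval mode is the identity:
+//
+//     let mut dropout = Dropout::new(0.5)?;
+//     dropout.train();
+//     let y = dropout.forward(&x)?; // ~half the values zeroed, survivors doubled
+//     dropout.eval();
+//     let y = dropout.forward(&x)?; // returns the input unchanged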
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::models::sequential::NeuralLayer;
+use rand::Rng;
+
+pub struct Dropout {
+    p: f32,
+    mask: Option<Vec<bool>>,
+    pub(crate) training: bool,
+    input: Option<Tensor>,
+}
+
+impl Dropout {
+    pub fn new(p: f32) -> Result<Self, BellandeError> {
+        if !(0.0..1.0).contains(&p) {
+            return Err(BellandeError::InvalidParameter(
+                "Dropout probability must be between 0 and 1".into(),
+            ));
+        }
+
+        Ok(Dropout {
+            p,
+            mask: None,
+            training: true,
+            input: None,
+        })
+    }
+
+    fn forward_impl(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        if !self.training {
+            return Ok(input.clone());
+        }
+
+        let mut rng = rand::thread_rng();
+        let mask: Vec<bool> = (0..input.data.len())
+            .map(|_| rng.gen::<f32>() > self.p)
+            .collect();
+
+        let scale = 1.0 / (1.0 - self.p);
+        let output: Vec<f32> = input
+            .data
+            .iter()
+            .zip(mask.iter())
+            .map(|(&x, &m)| if m { x * scale } else { 0.0 })
+            .collect();
+
+        self.mask = Some(mask);
+
+        Ok(Tensor::new(
+            output,
+            input.shape.clone(),
+            input.requires_grad,
+            input.device.clone(),
+            input.dtype,
+        ))
+    }
+
+    fn backward_input(&self, grad_output: &Tensor) -> Result<Tensor, BellandeError> {
+        let mask = self.mask.as_ref().ok_or_else(|| {
+            BellandeError::InvalidBackward("Forward pass not called before backward".into())
+        })?;
+
+        let scale = 1.0 / (1.0 - self.p);
+        let grad: Vec<f32> = grad_output
+            .data
+            .iter()
+            .zip(mask.iter())
+            .map(|(&g, &m)| if m { g * scale } else { 0.0 })
+            .collect();
+
+        Ok(Tensor::new(
+            grad,
+            grad_output.shape.clone(),
+            true,
+            grad_output.device.clone(),
+            grad_output.dtype,
+        ))
+    }
+}
+
+impl NeuralLayer for Dropout {
+    fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        let output = self.forward_impl(input)?;
+        self.input = Some(input.clone());
+        Ok(output)
+    }
+
+    fn backward(&mut self, grad_output: &Tensor) -> Result<Tensor, BellandeError> {
+        let _input = self.input.as_ref().ok_or(BellandeError::InvalidBackward(
+            "Forward pass not called before backward".into(),
+        ))?;
+
+        self.backward_input(grad_output)
+    }
+
+    fn parameters(&self) -> Vec<Tensor> {
+        Vec::new() // Dropout has no learnable parameters
+    }
+
+    fn named_parameters(&self) -> Vec<(String, Tensor)> {
+        Vec::new() // Dropout has no learnable parameters
+    }
+
+    fn set_parameter(&mut self, _name: &str, _value: Tensor) -> Result<(), BellandeError> {
+        Err(BellandeError::InvalidParameter(
+            "Dropout has no learnable parameters".to_string(),
+        ))
+    }
+
+    fn train(&mut self) {
+        self.training = true;
+    }
+
+    fn eval(&mut self) {
+        self.training = false;
+    }
+}
+
+// Implement Send and Sync for thread safety
+unsafe impl Send for Dropout {}
+unsafe impl Sync for Dropout {}
diff --git a/src/layer/layer_norm.rs b/src/layer/layer_norm.rs
new file mode 100644
index 0000000..0e4a033
--- /dev/null
+++ b/src/layer/layer_norm.rs
@@ -0,0 +1,196 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
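+
+// Usage sketch: LayerNorm normalizes each sample over its trailing feature
+// dimensions, unlike BatchNorm which normalizes each feature over the batch:
+//
+//     let mut ln = LayerNorm::new(vec![512], 1e-5, true);
+//     let y = ln.forward(&x)?; // x of shape [batch, 512]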
+ +use crate::core::{error::BellandeError, tensor::Tensor}; + +pub struct LayerNorm { + normalized_shape: Vec, + weight: Option, + bias: Option, + eps: f32, + input_cache: Option, +} + +struct LayerNormCache { + input: Tensor, + normalized: Tensor, + std: Vec, + mean: Vec, +} + +impl LayerNorm { + pub fn new(normalized_shape: Vec, eps: f32, elementwise_affine: bool) -> Self { + let weight = if elementwise_affine { + Some(Tensor::ones(&normalized_shape)) + } else { + None + }; + + let bias = if elementwise_affine { + Some(Tensor::zeros(&normalized_shape)) + } else { + None + }; + + LayerNorm { + normalized_shape, + weight, + bias, + eps, + input_cache: None, + } + } + + pub fn forward(&mut self, input: &Tensor) -> Result { + let batch_size = input.shape[0]; + let feature_size: usize = self.normalized_shape.iter().product(); + + let input_shape: Vec = input.shape[1..].to_vec(); + let norm_shape: Vec = self.normalized_shape.clone(); + + if input_shape != norm_shape { + return Err(BellandeError::InvalidShape(format!( + "Expected shape {:?}, got {:?}", + norm_shape, input_shape + ))); + } + let mut output = input.data.clone(); + let mut mean = vec![0.0; batch_size]; + let mut std = vec![0.0; batch_size]; + + // Calculate mean and standard deviation + for b in 0..batch_size { + let start_idx = b * feature_size; + let end_idx = start_idx + feature_size; + let batch_data = &input.data[start_idx..end_idx]; + + // Calculate mean + mean[b] = batch_data.iter().sum::() / feature_size as f32; + + // Calculate variance + let variance: f32 = batch_data + .iter() + .map(|&x| (x - mean[b]).powi(2)) + .sum::() + / feature_size as f32; + + std[b] = (variance + self.eps).sqrt(); + + // Normalize + for i in 0..feature_size { + let idx = start_idx + i; + output[idx] = (input.data[idx] - mean[b]) / std[b]; + + // Apply affine transform if available + if let (Some(ref weight), Some(ref bias)) = (&self.weight, &self.bias) { + output[idx] = output[idx] * weight.data[i] + bias.data[i]; + } + } + } + + // Cache for backward pass + self.input_cache = Some(LayerNormCache { + input: input.clone(), + normalized: Tensor::new( + output.clone(), + input.shape.clone(), + true, + input.device.clone(), + input.dtype, + ), + std, + mean, + }); + + Ok(Tensor::new( + output, + input.shape.clone(), + input.requires_grad, + input.device.clone(), + input.dtype, + )) + } + + pub fn backward(&self, grad_output: &Tensor) -> Result { + if let Some(ref cache) = self.input_cache { + let batch_size = grad_output.shape[0]; + let feature_size = self.normalized_shape.iter().product(); + let mut grad_input = vec![0.0; grad_output.data.len()]; + + for b in 0..batch_size { + let start_idx = b * feature_size; + let end_idx = start_idx + feature_size; + + let batch_grad = &grad_output.data[start_idx..end_idx]; + let batch_input = &cache.input.data[start_idx..end_idx]; + let mean = cache.mean[b]; + let std = cache.std[b]; + + // Calculate gradients + let mut sum_grad = 0.0; + let mut sum_grad_h = 0.0; + + for i in 0..feature_size { + let idx = start_idx + i; + let h = (batch_input[i] - mean) / std; + + if let (Some(ref weight), Some(ref bias)) = (&self.weight, &self.bias) { + sum_grad += grad_output.data[idx] * weight.data[i]; + sum_grad_h += grad_output.data[idx] * weight.data[i] * h; + } else { + sum_grad += grad_output.data[idx]; + sum_grad_h += grad_output.data[idx] * h; + } + } + + // Apply gradients + for i in 0..feature_size { + let idx = start_idx + i; + let h = (batch_input[i] - mean) / std; + + grad_input[idx] = (1.0 / (feature_size as f32 
diff --git a/src/layer/linear.rs b/src/layer/linear.rs
new file mode 100644
index 0000000..0498964
--- /dev/null
+++ b/src/layer/linear.rs
@@ -0,0 +1,234 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::models::sequential::NeuralLayer;
+
+pub struct Linear {
+    in_features: usize,
+    out_features: usize,
+    pub(crate) weight: Tensor,
+    pub(crate) bias: Option<Tensor>,
+    input_cache: Option<Tensor>,
+    pub(crate) weight_grad: Option<Tensor>,
+    pub(crate) bias_grad: Option<Tensor>,
+    pub(crate) training: bool,
+}
+
+impl Linear {
+    pub fn new(in_features: usize, out_features: usize, bias: bool) -> Self {
+        let weight = Tensor::randn(&[out_features, in_features]);
+        let bias = if bias {
+            Some(Tensor::zeros(&[out_features]))
+        } else {
+            None
+        };
+
+        Linear {
+            in_features,
+            out_features,
+            weight,
+            bias,
+            input_cache: None,
+            weight_grad: None,
+            bias_grad: None,
+            training: true,
+        }
+    }
+
+    pub fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        if input.shape.len() != 2 {
+            return Err(BellandeError::InvalidShape(
+                "Linear expects a 2D input tensor (batch_size, in_features)".to_string(),
+            ));
+        }
+
+        let batch_size = input.shape[0];
+        if input.shape[1] != self.in_features {
+            return Err(BellandeError::DimensionMismatch);
+        }
+
+        let mut output = vec![0.0; batch_size * self.out_features];
+
+        for b in 0..batch_size {
+            for o in 0..self.out_features {
+                let mut sum = 0.0;
+                for i in 0..self.in_features {
+                    sum += input.data[b * self.in_features + i]
+                        * self.weight.data[o * self.in_features + i];
+                }
+                if let Some(ref bias) = self.bias {
+                    sum += bias.data[o];
+                }
+                output[b * self.out_features + o] = sum;
+            }
+        }
+
+        self.input_cache = Some(input.clone());
+
+        Ok(Tensor::new(
+            output,
+            vec![batch_size, self.out_features],
+            true,
+            input.device.clone(),
+            input.dtype,
+        ))
+    }
+}
+
+impl NeuralLayer for Linear {
+    fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        self.forward(input)
+    }
+
+    fn backward(&mut self, grad_output: &Tensor) -> Result<Tensor, BellandeError> {
+        let (grad_input, grad_weight, grad_bias) = self.compute_gradients(grad_output)?;
+
+        // Store gradients
+        self.weight_grad = Some(grad_weight);
+        self.bias_grad = grad_bias;
+
+        Ok(grad_input)
+    }
+
+    fn parameters(&self) -> Vec<Tensor> {
+        let mut params = vec![self.weight.clone()];
+        if let Some(ref bias) = self.bias {
+            params.push(bias.clone());
+        }
+        params
+    }
+
+    fn named_parameters(&self) -> Vec<(String, Tensor)> {
+        let mut params = vec![("weight".to_string(), self.weight.clone())];
+        if let Some(ref bias) = self.bias {
+            params.push(("bias".to_string(), bias.clone()));
+        }
+        params
+    }
+
+    fn set_parameter(&mut self, name: &str, value: Tensor) -> Result<(), BellandeError> {
+        match name {
+            "weight" => {
+                if value.shape == self.weight.shape {
+                    self.weight = value;
+                    Ok(())
+                } else {
+                    Err(BellandeError::ShapeMismatch(
+                        "Weight shape mismatch".to_string(),
+                    ))
+                }
+            }
+            "bias" => {
+                if let Some(ref bias) = self.bias {
+                    if value.shape == bias.shape {
+                        self.bias = Some(value);
+                        Ok(())
+                    } else {
+                        Err(BellandeError::ShapeMismatch(
+                            "Bias shape mismatch".to_string(),
+                        ))
+                    }
+                } else {
+                    Err(BellandeError::InvalidParameter(
+                        "Layer does not use bias".to_string(),
+                    ))
+                }
+            }
+            _ => Err(BellandeError::InvalidParameter(format!(
+                "Unknown parameter name: {}",
+                name
+            ))),
+        }
+    }
+
+    fn train(&mut self) {
+        self.training = true;
+    }
+
+    fn eval(&mut self) {
+        self.training = false;
+    }
+}
+
+impl Linear {
+    fn compute_gradients(
+        &self,
+        grad_output: &Tensor,
+    ) -> Result<(Tensor, Tensor, Option<Tensor>), BellandeError> {
+        if let Some(ref input) = self.input_cache {
+            let batch_size = grad_output.shape[0];
+
+            // Gradient with respect to input
+            let mut grad_input = vec![0.0; input.data.len()];
+            // Gradient with respect to weight
+            let mut grad_weight = vec![0.0; self.weight.data.len()];
+            // Gradient with respect to bias
+            let mut grad_bias = if self.bias.is_some() {
+                Some(vec![0.0; self.out_features])
+            } else {
+                None
+            };
+
+            // Compute gradients
+            for b in 0..batch_size {
+                for o in 0..self.out_features {
+                    for i in 0..self.in_features {
+                        let grad = grad_output.data[b * self.out_features + o];
+                        grad_input[b * self.in_features + i] +=
+                            grad * self.weight.data[o * self.in_features + i];
+                        grad_weight[o * self.in_features + i] +=
+                            grad * input.data[b * self.in_features + i];
+                    }
+                    if let Some(ref mut bias) = grad_bias {
+                        bias[o] += grad_output.data[b * self.out_features + o];
+                    }
+                }
+            }
+
+            Ok((
+                Tensor::new(
+                    grad_input,
+                    input.shape.clone(),
+                    true,
+                    input.device.clone(),
+                    input.dtype,
+                ),
+                Tensor::new(
+                    grad_weight,
+                    self.weight.shape.clone(),
+                    true,
+                    self.weight.device.clone(),
+                    self.weight.dtype,
+                ),
+                grad_bias.map(|bias| {
+                    Tensor::new(
+                        bias,
+                        vec![self.out_features],
+                        true,
+                        self.weight.device.clone(),
+                        self.weight.dtype,
+                    )
+                }),
+            ))
+        } else {
+            Err(BellandeError::RuntimeError(
+                "Forward pass not called".into(),
+            ))
+        }
+    }
+}
+
+// Implement Send and Sync for thread safety
+unsafe impl Send for Linear {}
+unsafe impl Sync for Linear {}
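Editor's note: a usage sketch for Linear, which computes y = x · Wᵀ + b over a [batch, in_features] input (illustrative shapes, assuming the Tensor::randn constructor defined in this commit):

    let mut fc = Linear::new(3, 2, true);  // weight [2, 3], bias [2]
    let x = Tensor::randn(&[4, 3]);        // batch of 4
    let y = fc.forward(&x)?;               // shape [4, 2]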
diff --git a/src/layer/mod.rs b/src/layer/mod.rs
new file mode 100644
index 0000000..495b0c6
--- /dev/null
+++ b/src/layer/mod.rs
@@ -0,0 +1,10 @@
+pub mod activation;
+pub mod avgpool2d;
+pub mod batch_norm;
+pub mod conv;
+pub mod dropout;
+pub mod layer_norm;
+pub mod linear;
+pub mod pooling;
+pub mod recurrent;
+pub mod transformer;
diff --git a/src/layer/pooling.rs b/src/layer/pooling.rs
new file mode 100644
index 0000000..a59ddfa
--- /dev/null
+++ b/src/layer/pooling.rs
@@ -0,0 +1,163 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::models::sequential::NeuralLayer;
+
+pub struct MaxPool2d {
+    kernel_size: (usize, usize),
+    stride: (usize, usize),
+    indices: Option<Vec<usize>>,
+    pub(crate) input: Option<Tensor>,
+    training: bool,
+}
+
+impl MaxPool2d {
+    pub fn new(kernel_size: (usize, usize), stride: Option<(usize, usize)>) -> Self {
+        let stride = stride.unwrap_or(kernel_size);
+
+        MaxPool2d {
+            kernel_size,
+            stride,
+            indices: None,
+            input: None,
+            training: true,
+        }
+    }
+
+    fn forward_impl(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        if input.shape.len() != 4 {
+            return Err(BellandeError::InvalidShape(
+                "Expected 4D tensor (batch_size, channels, height, width)".into(),
+            ));
+        }
+
+        let (batch_size, channels, height, width) = (
+            input.shape[0],
+            input.shape[1],
+            input.shape[2],
+            input.shape[3],
+        );
+
+        let output_height = (height - self.kernel_size.0) / self.stride.0 + 1;
+        let output_width = (width - self.kernel_size.1) / self.stride.1 + 1;
+
+        let mut output = vec![0.0; batch_size * channels * output_height * output_width];
+        let mut indices = vec![0; batch_size * channels * output_height * output_width];
+
+        for b in 0..batch_size {
+            for c in 0..channels {
+                for h in 0..output_height {
+                    for w in 0..output_width {
+                        let mut max_val = f32::NEG_INFINITY;
+                        let mut max_idx = 0;
+
+                        for kh in 0..self.kernel_size.0 {
+                            for kw in 0..self.kernel_size.1 {
+                                let in_h = h * self.stride.0 + kh;
+                                let in_w = w * self.stride.1 + kw;
+                                let idx = ((b * channels + c) * height + in_h) * width + in_w;
+                                let val = input.data[idx];
+                                if val > max_val {
+                                    max_val = val;
+                                    max_idx = idx;
+                                }
+                            }
+                        }
+
+                        let out_idx = ((b * channels + c) * output_height + h) * output_width + w;
+                        output[out_idx] = max_val;
+                        indices[out_idx] = max_idx;
+                    }
+                }
+            }
+        }
+
+        self.indices = Some(indices);
+
+        Ok(Tensor::new(
+            output,
+            vec![batch_size, channels, output_height, output_width],
+            input.requires_grad,
+            input.device.clone(),
+            input.dtype,
+        ))
+    }
+
+    pub fn backward_input(
+        &self,
+        grad_output: &Tensor,
+        input: &Tensor,
+    ) -> Result<Tensor, BellandeError> {
+        let indices = self.indices.as_ref().ok_or(BellandeError::InvalidBackward(
+            "Forward pass not called before backward".into(),
+        ))?;
+
+        let mut grad_input = vec![0.0; input.data.len()];
+
+        for (out_idx, &in_idx) in indices.iter().enumerate() {
+            grad_input[in_idx] += grad_output.data[out_idx];
+        }
+
+        Ok(Tensor::new(
+            grad_input,
+            input.shape.clone(),
+            true,
+            input.device.clone(),
+            input.dtype,
+        ))
+    }
+}
+
+impl NeuralLayer for MaxPool2d {
+    fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        let output = self.forward_impl(input)?;
+        self.input = Some(input.clone());
+        Ok(output)
+    }
+
+    fn backward(&mut self, grad_output: &Tensor) -> Result<Tensor, BellandeError> {
+        let input = self.input.as_ref().ok_or(BellandeError::InvalidBackward(
+            "Forward pass not called before backward".into(),
+        ))?;
+        self.backward_input(grad_output, input)
+    }
+
+    fn parameters(&self) -> Vec<Tensor> {
+        Vec::new() // MaxPool2d has no learnable parameters
+    }
+
+    fn named_parameters(&self) -> Vec<(String, Tensor)> {
+        Vec::new() // MaxPool2d has no learnable parameters
+    }
+
+    fn set_parameter(&mut self, _name: &str, _value: Tensor) -> Result<(), BellandeError> {
+        Err(BellandeError::InvalidParameter(
+            "MaxPool2d has no learnable parameters".to_string(),
+        ))
+    }
+
+    fn train(&mut self) {
+        self.training = true;
+    }
+
+    fn eval(&mut self) {
+        self.training = false;
+    }
+}
+
+// Implement Send and Sync for thread safety
+unsafe impl Send for MaxPool2d {}
+unsafe impl Sync for MaxPool2d {}
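Editor's note: with no padding, the layer above produces output spatial dims H_out = (H - kernel_h) / stride_h + 1, and likewise for width. A sketch, going through the NeuralLayer trait so the input is cached for the backward pass:

    let mut pool = MaxPool2d::new((2, 2), None);   // stride defaults to the kernel size
    let x = Tensor::randn(&[1, 3, 8, 8]);          // (batch, channels, height, width)
    let y = NeuralLayer::forward(&mut pool, &x)?;  // shape [1, 3, 4, 4]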
diff --git a/src/layer/recurrent.rs b/src/layer/recurrent.rs
new file mode 100644
index 0000000..7ee5e6a
--- /dev/null
+++ b/src/layer/recurrent.rs
@@ -0,0 +1,241 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+
+pub struct LSTMCell {
+    input_size: usize,
+    hidden_size: usize,
+    weight_ih: Tensor, // Input-hidden weights
+    weight_hh: Tensor, // Hidden-hidden weights
+    bias_ih: Option<Tensor>,
+    bias_hh: Option<Tensor>,
+    cache: Option<LSTMCache>,
+}
+
+struct LSTMCache {
+    input: Tensor,
+    hidden: Tensor,
+    cell: Tensor,
+    gates: Tensor,
+}
+
+impl LSTMCell {
+    pub fn new(input_size: usize, hidden_size: usize, bias: bool) -> Self {
+        let weight_ih = Tensor::randn(&[4 * hidden_size, input_size]);
+        let weight_hh = Tensor::randn(&[4 * hidden_size, hidden_size]);
+
+        let bias_ih = if bias {
+            Some(Tensor::zeros(&[4 * hidden_size]))
+        } else {
+            None
+        };
+
+        let bias_hh = if bias {
+            Some(Tensor::zeros(&[4 * hidden_size]))
+        } else {
+            None
+        };
+
+        LSTMCell {
+            input_size,
+            hidden_size,
+            weight_ih,
+            weight_hh,
+            bias_ih,
+            bias_hh,
+            cache: None,
+        }
+    }
+
+    fn compute_gates(&self, input: &Tensor, h_prev: &Tensor) -> Result<Tensor, BellandeError> {
+        let ih = input.matmul(&self.weight_ih.transpose()?)?;
+        let hh = h_prev.matmul(&self.weight_hh.transpose()?)?;
+
+        let mut gates = ih.add(&hh)?;
+
+        if let Some(ref bias_ih) = self.bias_ih {
+            gates = gates.add(bias_ih)?;
+        }
+        if let Some(ref bias_hh) = self.bias_hh {
+            gates = gates.add(bias_hh)?;
+        }
+
+        Ok(gates)
+    }
+
+    fn split_gates(&self, gates: &Tensor) -> Result<Vec<Tensor>, BellandeError> {
+        let chunk_size = self.hidden_size;
+        let mut chunks = Vec::with_capacity(4);
+
+        for i in 0..4 {
+            let start = i * chunk_size;
+            chunks.push(gates.narrow(1, start, chunk_size)?);
+        }
+
+        Ok(chunks)
+    }
+
+    pub fn forward(
+        &mut self,
+        input: &Tensor,
+        hidden: Option<(Tensor, Tensor)>,
+    ) -> Result<(Tensor, Tensor), BellandeError> {
+        let batch_size = input.shape[0];
+
+        let (h_prev, c_prev) = match hidden {
+            Some((h, c)) => (h, c),
+            None => (
+                Tensor::zeros(&[batch_size, self.hidden_size]),
+                Tensor::zeros(&[batch_size, self.hidden_size]),
+            ),
+        };
+
+        // Calculate gates
+        let gates = self.compute_gates(input, &h_prev)?;
+
+        // Split gates into i, f, g, o
+        let chunks = self.split_gates(&gates)?;
+        let (i_gate, f_gate, g_gate, o_gate) = (&chunks[0], &chunks[1], &chunks[2], &chunks[3]);
+
+        // Apply gate operations
+        let f_c = f_gate.mul(&c_prev)?;
+        let i_g = i_gate.mul(g_gate)?;
+        let c_next = f_c.add(&i_g)?;
+
+        let c_tanh = c_next.tanh()?;
+        let h_next = o_gate.mul(&c_tanh)?;
+
+        // Cache for backward
+        self.cache = Some(LSTMCache {
+            input: input.clone(),
+            hidden: h_prev,
+            cell: c_prev,
+            gates,
+        });
+
+        Ok((h_next, c_next))
+    }
+}
+
+pub struct GRUCell {
+    input_size: usize,
+    hidden_size: usize,
+    weight_ih: Tensor,
+    weight_hh: Tensor,
+    bias_ih: Option<Tensor>,
+    bias_hh: Option<Tensor>,
+    cache: Option<GRUCache>,
+}
+
+struct GRUCache {
+    input: Tensor,
+    hidden: Tensor,
+    gates: Tensor,
+}
+
+impl GRUCell {
+    pub fn new(input_size: usize, hidden_size: usize, bias: bool) -> Self {
+        let weight_ih = Tensor::randn(&[3 * hidden_size, input_size]);
+        let weight_hh = Tensor::randn(&[3 * hidden_size, hidden_size]);
+
+        let bias_ih = if bias {
+            Some(Tensor::zeros(&[3 * hidden_size]))
+        } else {
+            None
+        };
+
+        let bias_hh = if bias {
+            Some(Tensor::zeros(&[3 * hidden_size]))
+        } else {
+            None
+        };
+
+        GRUCell {
+            input_size,
+            hidden_size,
+            weight_ih,
+            weight_hh,
+            bias_ih,
+            bias_hh,
+            cache: None,
+        }
+    }
+
+    fn compute_gates(&self, input: &Tensor, h_prev: &Tensor) -> Result<Tensor, BellandeError> {
+        let ih = input.matmul(&self.weight_ih.transpose()?)?;
+        let hh = h_prev.matmul(&self.weight_hh.transpose()?)?;
+
+        let mut gates = ih.add(&hh)?;
+
+        if let Some(ref bias_ih) = self.bias_ih {
+            gates = gates.add(bias_ih)?;
+        }
+        if let Some(ref bias_hh) = self.bias_hh {
+            gates = gates.add(bias_hh)?;
+        }
+
+        Ok(gates)
+    }
+
+    fn split_gates(&self, gates: &Tensor) -> Vec<Tensor> {
+        let chunk_size = self.hidden_size;
+        let mut chunks = Vec::with_capacity(3);
+
+        for i in 0..3 {
+            let start = i * chunk_size;
+            if let Ok(chunk) = gates.narrow(1, start, chunk_size) {
+                chunks.push(chunk);
+            }
+        }
+
+        chunks
+    }
+
+    fn forward(&mut self, input: &Tensor, hidden: Option<Tensor>) -> Result<Tensor, BellandeError> {
+        let batch_size = input.shape[0];
+
+        let h_prev = match hidden {
+            Some(h) => h,
+            None => Tensor::zeros(&[batch_size, self.hidden_size]),
+        };
+
+        // Calculate gates
+        let gates = self.compute_gates(input, &h_prev)?;
+        let chunks = self.split_gates(&gates);
+
+        if chunks.len() != 3 {
+            return Err(BellandeError::RuntimeError("Failed to split gates".into()));
+        }
+
+        let (r_gate, z_gate, n_gate) = (&chunks[0], &chunks[1], &chunks[2]);
+
+        // Apply GRU update
+        let ones = Tensor::ones(&z_gate.shape);
+        let z_complement = ones.sub(z_gate)?;
+        let zh = z_gate.mul(&h_prev)?;
+        let zn = z_complement.mul(n_gate)?;
+        let h_next = zh.add(&zn)?;
+
+        // Cache for backward
+        self.cache = Some(GRUCache {
+            input: input.clone(),
+            hidden: h_prev,
+            gates,
+        });
+
+        Ok(h_next)
+    }
+}
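Editor's note: a sketch of stepping LSTMCell over a sequence; the cell threads the (hidden, cell) state forward and starts from zeros when no state is given (illustrative sizes, using Tensor::randn from this commit):

    let mut cell = LSTMCell::new(10, 20, true);  // input_size 10, hidden_size 20
    let mut state: Option<(Tensor, Tensor)> = None;
    for _t in 0..5 {
        let x_t = Tensor::randn(&[8, 10]);       // batch of 8 at this timestep
        let (h, c) = cell.forward(&x_t, state)?;
        state = Some((h, c));
    }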
diff --git a/src/layer/transformer.rs b/src/layer/transformer.rs
new file mode 100644
index 0000000..0de97e0
--- /dev/null
+++ b/src/layer/transformer.rs
@@ -0,0 +1,259 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::layer::dropout::Dropout;
+use crate::layer::linear::Linear;
+use crate::layer::{activation::ReLU, layer_norm::LayerNorm};
+use crate::models::sequential::{NeuralLayer, Sequential};
+
+pub struct MultiHeadAttention {
+    num_heads: usize,
+    head_dim: usize,
+    q_proj: Linear,
+    k_proj: Linear,
+    v_proj: Linear,
+    out_proj: Linear,
+    dropout: Dropout,
+    cache: Option<AttentionCache>,
+}
+
+struct AttentionCache {
+    query: Tensor,
+    key: Tensor,
+    value: Tensor,
+    attention_weights: Tensor,
+}
+
+impl MultiHeadAttention {
+    pub fn new(
+        embed_dim: usize,
+        num_heads: usize,
+        dropout_rate: f32,
+    ) -> Result<Self, BellandeError> {
+        assert!(
+            embed_dim % num_heads == 0,
+            "Embedding dimension must be divisible by number of heads"
+        );
+
+        let head_dim = embed_dim / num_heads;
+
+        Ok(MultiHeadAttention {
+            num_heads,
+            head_dim,
+            q_proj: Linear::new(embed_dim, embed_dim, true),
+            k_proj: Linear::new(embed_dim, embed_dim, true),
+            v_proj: Linear::new(embed_dim, embed_dim, true),
+            out_proj: Linear::new(embed_dim, embed_dim, true),
+            dropout: Dropout::new(dropout_rate)?,
+            cache: None,
+        })
+    }
+
+    pub fn forward(
+        &mut self,
+        query: &Tensor,
+        key: &Tensor,
+        value: &Tensor,
+        mask: Option<&Tensor>,
+    ) -> Result<Tensor, BellandeError> {
+        let batch_size = query.shape[0];
+        let tgt_len = query.shape[1];
+        let src_len = key.shape[1];
+
+        // Linear projections
+        let q = self.q_proj.forward(query)?;
+        let k = self.k_proj.forward(key)?;
+        let v = self.v_proj.forward(value)?;
+
+        // Reshape and transpose for multi-head attention
+        let mut q = q.reshape(&[batch_size, tgt_len, self.num_heads, self.head_dim])?;
+        q = q.permute(&[0, 2, 1, 3])?; // batch, heads, tgt_len, head_dim
+
+        let mut k = k.reshape(&[batch_size, src_len, self.num_heads, self.head_dim])?;
+        k = k.permute(&[0, 2, 1, 3])?; // batch, heads, src_len, head_dim
+
+        let mut v = v.reshape(&[batch_size, src_len, self.num_heads, self.head_dim])?;
+        v = v.permute(&[0, 2, 1, 3])?; // batch, heads, src_len, head_dim
+
+        // Calculate attention scores
+        let scale = (self.head_dim as f32).sqrt();
+        let k_t = k.permute(&[0, 1, 3, 2])?; // transpose last two dimensions
+        let mut attention_weights = q.matmul(&k_t)?;
+        attention_weights = attention_weights.scale(1.0 / scale)?;
+
+        // Apply mask if provided
+        attention_weights = if let Some(mask) = mask {
+            attention_weights.masked_fill(mask, f32::NEG_INFINITY)?
+        } else {
+            attention_weights
+        };
+
+        // Apply softmax and dropout
+        attention_weights = attention_weights.softmax(-1)?;
+        attention_weights = NeuralLayer::forward(&mut self.dropout, &attention_weights)?;
+
+        // Apply attention to values
+        let mut output = attention_weights.matmul(&v)?;
+
+        // Reshape back
+        output = output.permute(&[0, 2, 1, 3])?; // batch, tgt_len, heads, head_dim
+        output = output.reshape(&[batch_size, tgt_len, self.num_heads * self.head_dim])?;
+
+        // Final projection
+        let output = self.out_proj.forward(&output)?;
+
+        // Cache for backward pass
+        self.cache = Some(AttentionCache {
+            query: query.clone(),
+            key: key.clone(),
+            value: value.clone(),
+            attention_weights,
+        });
+
+        Ok(output)
+    }
+}
+
+pub struct TransformerEncoderLayer {
+    self_attn: MultiHeadAttention,
+    ff_network: Sequential,
+    norm1: LayerNorm,
+    norm2: LayerNorm,
+    dropout: Dropout,
+}
+
+impl TransformerEncoderLayer {
+    pub fn new(
+        embed_dim: usize,
+        num_heads: usize,
+        ff_dim: usize,
+        dropout_rate: f32,
+    ) -> Result<Self, BellandeError> {
+        let mut ff_network = Sequential::new();
+        ff_network.add(Box::new(Linear::new(embed_dim, ff_dim, true)));
+        ff_network.add(Box::new(ReLU::new()));
+        ff_network.add(Box::new(Linear::new(ff_dim, embed_dim, true)));
+
+        Ok(TransformerEncoderLayer {
+            self_attn: MultiHeadAttention::new(embed_dim, num_heads, dropout_rate)?,
+            ff_network,
+            norm1: LayerNorm::new(vec![embed_dim], 1e-5, true),
+            norm2: LayerNorm::new(vec![embed_dim], 1e-5, true),
+            dropout: Dropout::new(dropout_rate)?,
+        })
+    }
+
+    pub fn forward(
+        &mut self,
+        src: &Tensor,
+        src_mask: Option<&Tensor>,
+    ) -> Result<Tensor, BellandeError> {
+        // Self attention block
+        let residual = src.clone();
+        let mut output = self.norm1.forward(src)?;
+        output = self
+            .self_attn
+            .forward(&output, &output, &output, src_mask)?;
+        output = NeuralLayer::forward(&mut self.dropout, &output)?;
+        output = output.add(&residual)?;
+
+        // Feed forward block
+        let residual = output.clone();
+        output = self.norm2.forward(&output)?;
+        output = self.ff_network.forward(&output)?;
+        output = NeuralLayer::forward(&mut self.dropout, &output)?;
+        output = output.add(&residual)?;
+
+        Ok(output)
+    }
+}
+
+pub struct TransformerDecoderLayer {
+    self_attn: MultiHeadAttention,
+    cross_attn: MultiHeadAttention,
+    ff_network: Sequential,
+    norm1: LayerNorm,
+    norm2: LayerNorm,
+    norm3: LayerNorm,
+    dropout: Dropout,
+}
+
+impl TransformerDecoderLayer {
+    pub fn new(
+        embed_dim: usize,
+        num_heads: usize,
+        ff_dim: usize,
+        dropout_rate: f32,
+    ) -> Result<Self, BellandeError> {
+        let mut ff_network = Sequential::new();
+        ff_network.add(Box::new(Linear::new(embed_dim, ff_dim, true)));
+        ff_network.add(Box::new(ReLU::new()));
+        ff_network.add(Box::new(Linear::new(ff_dim, embed_dim, true)));
+
+        Ok(TransformerDecoderLayer {
+            self_attn: MultiHeadAttention::new(embed_dim, num_heads, dropout_rate)?,
+            cross_attn: MultiHeadAttention::new(embed_dim, num_heads, dropout_rate)?,
+            ff_network,
+            norm1: LayerNorm::new(vec![embed_dim], 1e-5, true),
+            norm2: LayerNorm::new(vec![embed_dim], 1e-5, true),
+            norm3: LayerNorm::new(vec![embed_dim], 1e-5, true),
+            dropout: Dropout::new(dropout_rate)?,
+        })
+    }
+
+    pub fn forward(
+        &mut self,
+        tgt: &Tensor,
+        memory: &Tensor,
+        tgt_mask: Option<&Tensor>,
+        memory_mask: Option<&Tensor>,
+    ) -> Result<Tensor, BellandeError> {
+        // Self attention block
+        let residual = tgt.clone();
+        let mut output = self.norm1.forward(tgt)?;
+        output = self
+            .self_attn
+            .forward(&output, &output, &output, tgt_mask)?;
+        output = NeuralLayer::forward(&mut self.dropout, &output)?;
+        output = output.add(&residual)?;
+
+        // Cross attention block
+        let residual = output.clone();
+        output = self.norm2.forward(&output)?;
+        output = self
+            .cross_attn
+            .forward(&output, memory, memory, memory_mask)?;
+        output = NeuralLayer::forward(&mut self.dropout, &output)?;
+        output = output.add(&residual)?;
+
+        // Feed forward block
+        let residual = output.clone();
+        output = self.norm3.forward(&output)?;
+        output = self.ff_network.forward(&output)?;
+        output = NeuralLayer::forward(&mut self.dropout, &output)?;
+        output = output.add(&residual)?;
+
+        Ok(output)
+    }
+}
+
+// Implement thread safety
+unsafe impl Send for MultiHeadAttention {}
+unsafe impl Sync for MultiHeadAttention {}
unsafe impl Send for TransformerEncoderLayer {}
+unsafe impl Sync for TransformerEncoderLayer {}
+unsafe impl Send for TransformerDecoderLayer {}
+unsafe impl Sync for TransformerDecoderLayer {}
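Editor's note: a sketch of wiring one encoder layer, which applies pre-norm self-attention and a feed-forward block, each with a residual connection. Sizes are illustrative (embed_dim must be divisible by num_heads), and the shapes assume the tensor ops used above accept batched 3D inputs:

    let mut encoder = TransformerEncoderLayer::new(64, 8, 256, 0.1)?;
    let src = Tensor::randn(&[2, 16, 64]);   // (batch, seq_len, embed_dim)
    let out = encoder.forward(&src, None)?;  // same shape as src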
diff --git a/src/loss/bce.rs b/src/loss/bce.rs
new file mode 100644
index 0000000..c194a49
--- /dev/null
+++ b/src/loss/bce.rs
@@ -0,0 +1,445 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::loss::Loss;
+use std::f32;
+
+#[derive(Debug, Clone, Copy)]
+pub enum ReductionType {
+    Sum,
+    Mean,
+    Max,
+    Min,
+    Product,
+}
+
+#[derive(Clone, Copy)]
+pub enum Reduction {
+    None,
+    Mean,
+    Sum,
+}
+
+pub trait ReductionOps {
+    fn reduce(&self, input: &Tensor) -> Result<Tensor, BellandeError>;
+    fn reduce_backward(&self, grad_output: &Tensor) -> Result<Tensor, BellandeError>;
+}
+
+#[derive(Debug)]
+pub struct ReductionOperation {
+    reduction_type: ReductionType,
+    dim: Option<usize>,
+    keepdim: bool,
+    input_cache: Option<ReductionCache>,
+}
+
+pub struct BCELoss {
+    reduction: Reduction,
+    weight: Option<Tensor>,
+    eps: f32,
+}
+
+#[derive(Debug)]
+struct ReductionCache {
+    input: Tensor,
+    indices: Option<Vec<usize>>,
+}
+
+impl BCELoss {
+    pub fn new(reduction: Reduction, weight: Option<Tensor>) -> Self {
+        BCELoss {
+            reduction,
+            weight,
+            eps: 1e-8,
+        }
+    }
+}
+
+// Implement Loss trait for BCELoss
+impl Loss for BCELoss {
+    fn forward(&self, prediction: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError> {
+        if prediction.shape != target.shape {
+            return Err(BellandeError::DimensionMismatch);
+        }
+
+        let mut loss = Vec::with_capacity(prediction.data.len());
+        for (pred, tgt) in prediction.data.iter().zip(target.data.iter()) {
+            let p = pred.clamp(self.eps, 1.0 - self.eps);
+            let l = -tgt * p.ln() - (1.0 - tgt) * (1.0 - p).ln();
+            if let Some(ref weight) = self.weight {
+                loss.push(l * weight.data[0]);
+            } else {
+                loss.push(l);
+            }
+        }
+
+        match self.reduction {
+            Reduction::None => Ok(Tensor::new(
+                loss,
+                prediction.shape.clone(),
+                true,
+                prediction.device.clone(),
+                prediction.dtype,
+            )),
+            Reduction::Mean => Ok(Tensor::new(
+                vec![loss.iter().sum::<f32>() / loss.len() as f32],
+                vec![1],
+                true,
+                prediction.device.clone(),
+                prediction.dtype,
+            )),
+            Reduction::Sum => Ok(Tensor::new(
+                vec![loss.iter().sum()],
+                vec![1],
+                true,
+                prediction.device.clone(),
+                prediction.dtype,
+            )),
+        }
+    }
+
+    fn backward(&self, prediction: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError> {
+        if prediction.shape != target.shape {
+            return Err(BellandeError::DimensionMismatch);
+        }
+
+        let mut grad = Vec::with_capacity(prediction.data.len());
+        for (pred, tgt) in prediction.data.iter().zip(target.data.iter()) {
+            let p = pred.clamp(self.eps, 1.0 - self.eps);
+            let mut g = (p - tgt) / (p * (1.0 - p));
+
+            if let Some(ref weight) = self.weight {
+                g *= weight.data[0];
+            }
+
+            grad.push(g);
+        }
+
+        let grad = match self.reduction {
+            Reduction::None => grad,
+            Reduction::Mean => {
+                let scale = 1.0 / prediction.data.len() as f32;
+                grad.iter().map(|&g| g * scale).collect()
+            }
+            Reduction::Sum => grad,
+        };
+
+        Ok(Tensor::new(
+            grad,
+            prediction.shape.clone(),
+            true,
+            prediction.device.clone(),
+            prediction.dtype,
+        ))
+    }
+}
+
+// Implement Send and Sync for thread safety
+unsafe impl Send for BCELoss {}
+unsafe impl Sync for BCELoss {}
+
+impl ReductionOperation {
+    pub fn new(reduction_type: ReductionType, dim: Option<usize>, keepdim: bool) -> Self {
+        ReductionOperation {
+            reduction_type,
+            dim,
+            keepdim,
+            input_cache: None,
+        }
+    }
+
+    pub fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        let (output_data, output_shape, indices) = match self.dim {
+            Some(dim) => self.reduce_along_dim(input, dim)?,
+            None => self.reduce_all(input)?,
+        };
+
+        self.input_cache = Some(ReductionCache {
+            input: input.clone(),
+            indices,
+        });
+
+        Ok(Tensor::new(
+            output_data,
+            output_shape,
+            input.requires_grad,
+            input.device.clone(),
+            input.dtype,
+        ))
+    }
+
+    pub fn backward(&self, grad_output: &Tensor) -> Result<Tensor, BellandeError> {
+        if let Some(ref cache) = self.input_cache {
+            let input_shape = cache.input.shape.clone();
+            let mut grad_input = vec![0.0; cache.input.data.len()];
+
+            match self.reduction_type {
+                ReductionType::Sum => {
+                    self.backward_sum(&mut grad_input, grad_output, &input_shape)?;
+                }
+                ReductionType::Mean => {
+                    self.backward_mean(&mut grad_input, grad_output, &input_shape)?;
+                }
+                ReductionType::Max | ReductionType::Min => {
+                    self.backward_max_min(
+                        &mut grad_input,
+                        grad_output,
+                        &cache.indices.clone().unwrap(),
+                    )?;
+                }
+                ReductionType::Product => {
+                    self.backward_product(&mut grad_input, grad_output, &cache.input)?;
+                }
+            }
+
+            Ok(Tensor::new(
+                grad_input,
+                input_shape,
+                true,
+                grad_output.device.clone(),
+                grad_output.dtype,
+            ))
+        } else {
+            Err(BellandeError::RuntimeError(
+                "Forward pass not called".into(),
+            ))
+        }
+    }
+
+    fn reduce_along_dim(
+        &self,
+        input: &Tensor,
+        dim: usize,
+    ) -> Result<(Vec<f32>, Vec<usize>, Option<Vec<usize>>), BellandeError> {
+        if dim >= input.shape.len() {
+            return Err(BellandeError::RuntimeError(format!(
+                "Dimension {} out of bounds for tensor of shape {:?}",
+                dim, input.shape
+            )));
+        }
+
+        let mut output_shape = input.shape.clone();
+        if !self.keepdim {
+            output_shape.remove(dim);
+        } else {
+            output_shape[dim] = 1;
+        }
+
+        let stride = input.shape[dim];
+        let outer_size: usize = input.shape[..dim].iter().product();
+        let inner_size: usize = input.shape[dim + 1..].iter().product();
+
+        let mut output = Vec::new();
+        let mut indices = if matches!(self.reduction_type, ReductionType::Max | ReductionType::Min)
+        {
+            Some(Vec::new())
+        } else {
+            None
+        };
+
+        for outer in 0..outer_size {
+            for inner in 0..inner_size {
+                let mut values = Vec::with_capacity(stride);
+                for s in 0..stride {
+                    let idx = (outer * stride + s) * inner_size + inner;
+                    values.push(input.data[idx]);
+                }
+
+                let (result, index) = match self.reduction_type {
+                    ReductionType::Sum => (values.iter().sum(), None),
+                    ReductionType::Mean => (values.iter().sum::<f32>() / stride as f32, None),
+                    ReductionType::Max => {
+                        let (max_idx, &max_val) = values
+                            .iter()
+                            .enumerate()
+                            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+                            .unwrap();
+                        (max_val, Some(max_idx))
+                    }
+                    ReductionType::Min => {
+                        let (min_idx, &min_val) = values
+                            .iter()
+                            .enumerate()
+                            .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+                            .unwrap();
+                        (min_val, Some(min_idx))
+                    }
+                    ReductionType::Product => (values.iter().product(), None),
+                };
+
+                output.push(result);
+                if let Some(ref mut indices_vec) = indices {
+                    if let Some(idx) = index {
+                        indices_vec.push(idx);
+                    }
+                }
+            }
+        }
+
+        Ok((output, output_shape, indices))
+    }
+
+    fn reduce_all(
+        &self,
+        input: &Tensor,
+    ) -> Result<(Vec<f32>, Vec<usize>, Option<Vec<usize>>), BellandeError> {
+        let output_shape = if self.keepdim {
+            vec![1; input.shape.len()]
+        } else {
+            vec![1]
+        };
+
+        let (result, indices) = match self.reduction_type {
+            ReductionType::Sum => (vec![input.data.iter().sum()], None),
+            ReductionType::Mean => (
+                vec![input.data.iter().sum::<f32>() / input.data.len() as f32],
+                None,
+            ),
+            ReductionType::Max => {
+                let (max_idx, &max_val) = input
+                    .data
+                    .iter()
+                    .enumerate()
+                    .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+                    .unwrap();
+                (vec![max_val], Some(vec![max_idx]))
+            }
+            ReductionType::Min => {
+                let (min_idx, &min_val) = input
+                    .data
+                    .iter()
+                    .enumerate()
+                    .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+                    .unwrap();
+                (vec![min_val], Some(vec![min_idx]))
+            }
+            ReductionType::Product => (vec![input.data.iter().product()], None),
+        };
+
+        Ok((result, output_shape, indices))
+    }
+
+    fn backward_sum(
+        &self,
+        grad_input: &mut [f32],
+        grad_output: &Tensor,
+        input_shape: &[usize],
+    ) -> Result<(), BellandeError> {
+        match self.dim {
+            Some(dim) => {
+                let stride = input_shape[dim];
+                let outer_size: usize = input_shape[..dim].iter().product();
+                let inner_size: usize = input_shape[dim + 1..].iter().product();
+
+                for outer in 0..outer_size {
+                    for inner in 0..inner_size {
+                        let grad = grad_output.data[(outer * inner_size + inner)];
+                        for s in 0..stride {
+                            let idx = (outer * stride + s) * inner_size + inner;
+                            grad_input[idx] = grad;
+                        }
+                    }
+                }
+            }
+            None => {
+                let grad = grad_output.data[0];
+                grad_input.iter_mut().for_each(|x| *x = grad);
+            }
+        }
+        Ok(())
+    }
+
+    fn backward_mean(
+        &self,
+        grad_input: &mut [f32],
+        grad_output: &Tensor,
+        input_shape: &[usize],
+    ) -> Result<(), BellandeError> {
+        match self.dim {
+            Some(dim) => {
+                let stride = input_shape[dim] as f32;
+                let outer_size: usize = input_shape[..dim].iter().product();
+                let inner_size: usize = input_shape[dim + 1..].iter().product();
+
+                for outer in 0..outer_size {
+                    for inner in 0..inner_size {
+                        let grad = grad_output.data[(outer * inner_size + inner)] / stride;
+                        for s in 0..input_shape[dim] {
+                            let idx = (outer * input_shape[dim] + s) * inner_size + inner;
+                            grad_input[idx] = grad;
+                        }
+                    }
+                }
+            }
+            None => {
+                let grad = grad_output.data[0] / grad_input.len() as f32;
+                grad_input.iter_mut().for_each(|x| *x = grad);
+            }
+        }
+        Ok(())
+    }
+
+    fn backward_max_min(
+        &self,
+        grad_input: &mut [f32],
+        grad_output: &Tensor,
+        indices: &[usize],
+    ) -> Result<(), BellandeError> {
+        for (idx, &grad) in indices.iter().zip(grad_output.data.iter()) {
+            grad_input[*idx] = grad;
+        }
+        Ok(())
+    }
+
+    fn backward_product(
+        &self,
+        grad_input: &mut [f32],
+        grad_output: &Tensor,
+        input: &Tensor,
+    ) -> Result<(), BellandeError> {
+        match self.dim {
+            Some(dim) => {
+                let stride = input.shape[dim];
+                let outer_size: usize = input.shape[..dim].iter().product();
+                let inner_size: usize = input.shape[dim + 1..].iter().product();
+
+                for outer in 0..outer_size {
+                    for inner in 0..inner_size {
+                        let mut product = 1.0;
+                        for s in 0..stride {
+                            let idx = (outer * stride + s) * inner_size + inner;
+                            product *= input.data[idx];
+                        }
+
+                        let grad = grad_output.data[(outer * inner_size + inner)];
+                        for s in 0..stride {
+                            let idx = (outer * stride + s) * inner_size + inner;
+                            grad_input[idx] = grad * product / input.data[idx];
+                        }
+                    }
+                }
+            }
+            None => {
+                let product: f32 = input.data.iter().product();
+                let grad = grad_output.data[0];
+                for (i, &val) in input.data.iter().enumerate() {
+                    grad_input[i] = grad * product / val;
+                }
+            }
+        }
+        Ok(())
+    }
+}
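Editor's note: BCELoss above computes l = -[t·ln(p) + (1 - t)·ln(1 - p)] with p clamped to [eps, 1 - eps] for numerical stability, so predictions are expected to already be probabilities. A sketch using only constructors from this commit (values are stand-ins):

    let bce = BCELoss::new(Reduction::Mean, None);
    let pred = Tensor::ones(&[4, 1]);         // stand-in probabilities
    let target = Tensor::ones(&[4, 1]);
    let loss = bce.forward(&pred, &target)?;  // single-element tensor under Mean
    let grad = bce.backward(&pred, &target)?; // dL/dp, same shape as pred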
diff --git a/src/loss/cross_entropy.rs b/src/loss/cross_entropy.rs
new file mode 100644
index 0000000..8b1df94
--- /dev/null
+++ b/src/loss/cross_entropy.rs
@@ -0,0 +1,309 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::loss::bce::Reduction;
+use crate::loss::Loss;
+
+/// Cross Entropy Loss implementation with support for class weights and ignored indices
+pub struct CrossEntropyLoss {
+    reduction: Reduction,
+    weight: Option<Tensor>,
+    ignore_index: Option<i64>,
+}
+
+impl CrossEntropyLoss {
+    pub fn new(reduction: Reduction, weight: Option<Tensor>, ignore_index: Option<i64>) -> Self {
+        CrossEntropyLoss {
+            reduction,
+            weight,
+            ignore_index,
+        }
+    }
+
+    pub fn default() -> Self {
+        CrossEntropyLoss {
+            reduction: Reduction::Mean,
+            weight: None,
+            ignore_index: None,
+        }
+    }
+
+    fn validate_input(&self, prediction: &Tensor, target: &Tensor) -> Result<(), BellandeError> {
+        if prediction.shape.len() != 2 {
+            return Err(BellandeError::InvalidInputs(
+                "Prediction tensor must be 2-dimensional (batch_size, num_classes)".to_string(),
+            ));
+        }
+
+        if target.shape.len() != 1 {
+            return Err(BellandeError::InvalidInputs(
+                "Target tensor must be 1-dimensional (batch_size)".to_string(),
+            ));
+        }
+
+        if prediction.shape[0] != target.shape[0] {
+            return Err(BellandeError::ShapeMismatch(
+                "Batch sizes of prediction and target must match".to_string(),
+            ));
+        }
+
+        Ok(())
+    }
+
+    fn compute_log_softmax(&self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        let batch_size = input.shape[0];
+        let num_classes = input.shape[1];
+
+        // Find max values for numerical stability
+        let mut max_vals = vec![f32::NEG_INFINITY; batch_size];
+        for b in 0..batch_size {
+            for c in 0..num_classes {
+                let idx = b * num_classes + c;
+                max_vals[b] = max_vals[b].max(input.data[idx]);
+            }
+        }
+
+        // Compute exp(x - max) and sum
+        let mut exp_sum = vec![0.0; batch_size];
+        let mut shifted = vec![0.0; input.data.len()];
+
+        for b in 0..batch_size {
+            for c in 0..num_classes {
+                let idx = b * num_classes + c;
+                shifted[idx] = (input.data[idx] - max_vals[b]).exp();
+                exp_sum[b] += shifted[idx];
+            }
+        }
+
+        // Compute log_softmax
+        let mut output = vec![0.0; input.data.len()];
+        for b in 0..batch_size {
+            let log_sum = exp_sum[b].ln();
+            for c in 0..num_classes {
+                let idx = b * num_classes + c;
+                output[idx] = input.data[idx] - max_vals[b] - log_sum;
+            }
+        }
+
+        Ok(Tensor::new(
+            output,
+            input.shape.clone(),
+            true,
+            input.device.clone(),
+            input.dtype,
+        ))
+    }
+
+    fn compute_softmax(&self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        let batch_size = input.shape[0];
+        let num_classes = input.shape[1];
+
+        // Find max values for numerical stability
+        let mut max_vals = vec![f32::NEG_INFINITY; batch_size];
+        for b in 0..batch_size {
+            for c in 0..num_classes {
+                let idx = b * num_classes + c;
+                max_vals[b] = max_vals[b].max(input.data[idx]);
+            }
+        }
+
+        // Compute exp(x - max) and sum
+        let mut exp_sum = vec![0.0; batch_size];
+        let mut output = vec![0.0; input.data.len()];
+
+        for b in 0..batch_size {
+            for c in 0..num_classes {
+                let idx = b * num_classes + c;
+                output[idx] = (input.data[idx] - max_vals[b]).exp();
+                exp_sum[b] += output[idx];
+            }
+        }
+
+        // Normalize
+        for b in 0..batch_size {
+            for c in 0..num_classes {
+                let idx = b * num_classes + c;
+                output[idx] /= exp_sum[b];
+            }
+        }
+
+        Ok(Tensor::new(
+            output,
+            input.shape.clone(),
+            true,
+            input.device.clone(),
+            input.dtype,
+        ))
+    }
+
+    fn convert_to_one_hot(
+        &self,
+        target: &Tensor,
+        num_classes: usize,
+    ) -> Result<Tensor, BellandeError> {
+        let batch_size = target.shape[0];
+        let mut one_hot = vec![0.0; batch_size * num_classes];
+
+        for i in 0..batch_size {
+            let target_idx = target.data[i] as usize;
+            if target_idx >= num_classes {
+                return Err(BellandeError::InvalidInputs(format!(
+                    "Target class {} is out of range (0, {})",
+                    target_idx,
+                    num_classes - 1
+                )));
+            }
+            one_hot[i * num_classes + target_idx] = 1.0;
+        }
+
+        Ok(Tensor::new(
+            one_hot,
+            vec![batch_size, num_classes],
+            true,
+            target.device.clone(),
+            target.dtype,
+        ))
+    }
+
+    fn element_wise_multiply(&self, a: &Tensor, b: &Tensor) -> Result<Tensor, BellandeError> {
+        if a.shape != b.shape {
+            return Err(BellandeError::ShapeMismatch(
+                "Tensor shapes must match for multiplication".into(),
+            ));
+        }
+
+        let output: Vec<f32> = a
+            .data
+            .iter()
+            .zip(b.data.iter())
+            .map(|(&x, &y)| x * y)
+            .collect();
+
+        Ok(Tensor::new(
+            output,
+            a.shape.clone(),
+            true,
+            a.device.clone(),
+            a.dtype,
+        ))
+    }
+}
+
+impl Loss for CrossEntropyLoss {
+    fn forward(&self, prediction: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError> {
+        self.validate_input(prediction, target)?;
+
+        let num_classes = prediction.shape[1];
+        let log_softmax = self.compute_log_softmax(prediction)?;
+        let target_one_hot = self.convert_to_one_hot(target, num_classes)?;
+
+        // Compute negative log likelihood
+        let mut loss = self.element_wise_multiply(&target_one_hot, &log_softmax)?;
+        loss.data.iter_mut().for_each(|x| *x = -*x);
+
+        // Apply class weights if provided
+        if let Some(ref weight) = self.weight {
+            loss = self.element_wise_multiply(&loss, weight)?;
+        }
+
+        // Apply ignore index masking if specified
+        if let Some(ignore_idx) = self.ignore_index {
+            for i in 0..target.shape[0] {
+                if target.data[i] as i64 == ignore_idx {
+                    for j in 0..loss.shape[1] {
+                        loss.data[i * loss.shape[1] + j] = 0.0;
+                    }
+                }
+            }
+        }
+
+        // Apply reduction
+        match self.reduction {
+            Reduction::Mean => {
+                let sum: f32 = loss.data.iter().sum();
+                let mean = sum / (loss.data.len() as f32);
+                Ok(Tensor::new(
+                    vec![mean],
+                    vec![1],
+                    true,
+                    loss.device,
+                    loss.dtype,
+                ))
+            }
+            Reduction::Sum => {
+                let sum: f32 = loss.data.iter().sum();
+                Ok(Tensor::new(
+                    vec![sum],
+                    vec![1],
+                    true,
+                    loss.device,
+                    loss.dtype,
+                ))
+            }
+            Reduction::None => Ok(loss),
+        }
+    }
+
+    fn backward(&self, prediction: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError> {
+        let softmax = self.compute_softmax(prediction)?;
+        let num_classes = prediction.shape[1];
+        let target_one_hot = self.convert_to_one_hot(target, num_classes)?;
+
+        // Compute gradients
+        let mut grad_output = vec![0.0; softmax.data.len()];
+        for i in 0..softmax.data.len() {
+            grad_output[i] = softmax.data[i] - target_one_hot.data[i];
+        }
+
+        let mut grad = Tensor::new(
+            grad_output,
+            softmax.shape,
+            true,
+            softmax.device,
+            softmax.dtype,
+        );
+
+        // Apply class weights if provided
+        if let Some(ref weight) = self.weight {
+            grad = self.element_wise_multiply(&grad, weight)?;
+        }
+
+        // Apply ignore index masking if specified
+        if let Some(ignore_idx) = self.ignore_index {
+            for i in 0..target.shape[0] {
+                if target.data[i] as i64 == ignore_idx {
+                    for j in 0..grad.shape[1] {
+                        grad.data[i * grad.shape[1] + j] = 0.0;
+                    }
+                }
+            }
+        }
+
+        // Apply reduction
+        match self.reduction {
+            Reduction::Mean => {
+                let batch_size = prediction.shape[0] as f32;
+                grad.data.iter_mut().for_each(|x| *x /= batch_size);
+                Ok(grad)
+            }
+            _ => Ok(grad),
+        }
+    }
+}
+
+// Implement thread safety
+unsafe impl Send for CrossEntropyLoss {}
+unsafe impl Sync for CrossEntropyLoss {}
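Editor's note: the forward pass above uses the max-shift trick, log_softmax(x)_c = x_c - max(x) - ln(Σ_j exp(x_j - max(x))), and the backward pass reduces to softmax(logits) - one_hot(target). A sketch with illustrative shapes (targets are class indices stored as floats; all zero here just for brevity):

    let ce = CrossEntropyLoss::default();
    let logits = Tensor::randn(&[4, 10]);       // (batch, num_classes)
    let targets = Tensor::zeros(&[4]);          // all class 0, for illustration
    let loss = ce.forward(&logits, &targets)?;
    let grad = ce.backward(&logits, &targets)?; // softmax - one_hot, averaged over the batch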
diff --git a/src/loss/custom.rs b/src/loss/custom.rs
new file mode 100644
index 0000000..10b7da6
--- /dev/null
+++ b/src/loss/custom.rs
@@ -0,0 +1,60 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::loss::bce::Reduction;
+
+pub trait CustomLossFunction {
+    fn compute(&self, prediction: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError>;
+}
+
+pub struct CustomLoss {
+    loss_fn: Box<dyn CustomLossFunction>,
+    reduction: Reduction,
+}
+
+impl CustomLoss {
+    pub fn new(loss_fn: Box<dyn CustomLossFunction>, reduction: Reduction) -> Self {
+        CustomLoss { loss_fn, reduction }
+    }
+
+    pub fn forward(&self, prediction: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError> {
+        let loss = self.loss_fn.compute(prediction, target)?;
+
+        match self.reduction {
+            Reduction::None => Ok(loss),
+            Reduction::Mean => {
+                let mean = loss.data.iter().sum::<f32>() / loss.data.len() as f32;
+                Ok(Tensor::new(
+                    vec![mean],
+                    vec![1],
+                    true,
+                    loss.device,
+                    loss.dtype,
+                ))
+            }
+            Reduction::Sum => {
+                let sum = loss.data.iter().sum::<f32>();
+                Ok(Tensor::new(
+                    vec![sum],
+                    vec![1],
+                    true,
+                    loss.device,
+                    loss.dtype,
+                ))
+            }
+        }
+    }
+}
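Editor's note: a sketch of plugging a user-defined criterion into CustomLoss; this hypothetical L1 term mirrors how the built-in losses construct their output tensors:

    struct L1Loss;
    impl CustomLossFunction for L1Loss {
        fn compute(&self, prediction: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError> {
            // Element-wise |pred - target|; assumes shapes already match
            let data: Vec<f32> = prediction
                .data
                .iter()
                .zip(target.data.iter())
                .map(|(&p, &t)| (p - t).abs())
                .collect();
            Ok(Tensor::new(
                data,
                prediction.shape.clone(),
                true,
                prediction.device.clone(),
                prediction.dtype,
            ))
        }
    }

    let l1 = CustomLoss::new(Box::new(L1Loss), Reduction::Mean);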
diff --git a/src/loss/mod.rs b/src/loss/mod.rs
new file mode 100644
index 0000000..74982c8
--- /dev/null
+++ b/src/loss/mod.rs
@@ -0,0 +1,158 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+
+pub mod bce;
+pub mod cross_entropy;
+pub mod custom;
+pub mod mse;
+
+/// The Loss trait defines the interface for loss functions used in training neural networks.
+pub trait Loss: Send + Sync {
+    fn forward(&self, output: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError>;
+    fn backward(&self, output: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError>;
+
+    /// Optional method to get the name of the loss function
+    fn name(&self) -> &str {
+        "GenericLoss"
+    }
+
+    /// Optional method to get the reduction method used by the loss function
+    fn reduction(&self) -> Reduction {
+        Reduction::Mean
+    }
+}
+
+/// Enumeration of possible reduction methods for loss functions
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum Reduction {
+    None,
+    Mean,
+    Sum,
+}
+
+pub trait StaticLoss: Loss + 'static {}
+impl<T: Loss + 'static> StaticLoss for T {}
+
+pub trait LossInit: Loss {
+    fn new() -> Self;
+    fn new_with_reduction(reduction: Reduction) -> Self;
+}
+
+pub trait WeightedLoss: Loss {
+    /// Computes the forward pass with sample weights
+    fn forward_weighted(
+        &self,
+        output: &Tensor,
+        target: &Tensor,
+        weights: &Tensor,
+    ) -> Result<Tensor, BellandeError>;
+
+    /// Computes the backward pass with sample weights
+    fn backward_weighted(
+        &self,
+        output: &Tensor,
+        target: &Tensor,
+        weights: &Tensor,
+    ) -> Result<Tensor, BellandeError>;
+}
+
+pub trait ClassWeightedLoss: Loss {
+    fn set_class_weights(&mut self, weights: Tensor) -> Result<(), BellandeError>;
+    fn get_class_weights(&self) -> Option<&Tensor>;
+}
+
+pub mod utils {
+    use super::*;
+
+    /// Validates input shapes for loss computation
+    pub fn validate_shapes(output: &Tensor, target: &Tensor) -> Result<(), BellandeError> {
+        if output.shape != target.shape {
+            return Err(BellandeError::ShapeMismatch(format!(
+                "Output shape {:?} doesn't match target shape {:?}",
+                output.shape, target.shape
+            )));
+        }
+        Ok(())
+    }
+
+    /// Applies reduction method to loss values
+    pub fn apply_reduction(loss: &Tensor, reduction: Reduction) -> Result<Tensor, BellandeError> {
+        let result = match reduction {
+            Reduction::None => Ok(loss.clone()),
+            Reduction::Mean => {
+                let sum: f32 = loss.data.iter().sum();
+                let mean = sum / (loss.data.len() as f32);
+                Ok(Tensor::new(
+                    vec![mean],
+                    vec![1],
+                    loss.requires_grad,
+                    loss.device.clone(),
+                    loss.dtype,
+                ))
+            }
+            Reduction::Sum => {
+                let sum: f32 = loss.data.iter().sum();
+                Ok(Tensor::new(
+                    vec![sum],
+                    vec![1],
+                    loss.requires_grad,
+                    loss.device.clone(),
+                    loss.dtype,
+                ))
+            }
+        };
+        result
+    }
+
+    /// Compute element-wise loss without reduction
+    pub fn compute_elementwise_loss(
+        output: &Tensor,
+        target: &Tensor,
+        op: impl Fn(f32, f32) -> f32,
+    ) -> Result<Tensor, BellandeError> {
+        validate_shapes(output, target)?;
+
+        let loss_data: Vec<f32> = output
+            .data
+            .iter()
+            .zip(target.data.iter())
+            .map(|(&o, &t)| op(o, t))
+            .collect();
+
+        Ok(Tensor::new(
+            loss_data,
+            output.shape.clone(),
+            output.requires_grad,
+            output.device.clone(),
+            output.dtype,
+        ))
+    }
+
+    /// Apply weights to loss values
+    pub fn apply_weights(loss: &mut Tensor, weights: &Tensor) -> Result<(), BellandeError> {
+        if loss.shape != weights.shape {
+            return Err(BellandeError::ShapeMismatch(
+                "Weights shape doesn't match loss shape".into(),
+            ));
+        }
+
+        for (l, &w) in loss.data.iter_mut().zip(weights.data.iter()) {
+            *l *= w;
+        }
+        Ok(())
+    }
+}
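Editor's note: the utils helpers compose into a complete loss pipeline; a sketch building a squared-error loss from the element-wise helper plus a reduction (illustrative shapes):

    use crate::loss::utils;

    let output = Tensor::randn(&[4, 2]);
    let target = Tensor::zeros(&[4, 2]);
    let pointwise = utils::compute_elementwise_loss(&output, &target, |o, t| (o - t).powi(2))?;
    let reduced = utils::apply_reduction(&pointwise, Reduction::Mean)?;  // [1]-shaped tensor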
diff --git a/src/loss/mse.rs b/src/loss/mse.rs
new file mode 100644
index 0000000..aee34d8
--- /dev/null
+++ b/src/loss/mse.rs
@@ -0,0 +1,99 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::loss::{bce::Reduction, Loss};
+
+pub struct MSELoss {
+    reduction: Reduction,
+}
+
+impl MSELoss {
+    pub fn new(reduction: Reduction) -> Self {
+        MSELoss { reduction }
+    }
+}
+
+// Implement Loss trait for MSELoss
+impl Loss for MSELoss {
+    fn forward(&self, prediction: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError> {
+        if prediction.shape != target.shape {
+            return Err(BellandeError::DimensionMismatch);
+        }
+
+        let mut loss = Vec::with_capacity(prediction.data.len());
+        for (pred, tgt) in prediction.data.iter().zip(target.data.iter()) {
+            loss.push((pred - tgt).powi(2));
+        }
+
+        match self.reduction {
+            Reduction::None => Ok(Tensor::new(
+                loss,
+                prediction.shape.clone(),
+                true,
+                prediction.device.clone(),
+                prediction.dtype,
+            )),
+            Reduction::Mean => Ok(Tensor::new(
+                vec![loss.iter().sum::<f32>() / loss.len() as f32],
+                vec![1],
+                true,
+                prediction.device.clone(),
+                prediction.dtype,
+            )),
+            Reduction::Sum => Ok(Tensor::new(
+                vec![loss.iter().sum()],
+                vec![1],
+                true,
+                prediction.device.clone(),
+                prediction.dtype,
+            )),
+        }
+    }
+
+    fn backward(&self, prediction: &Tensor, target: &Tensor) -> Result<Tensor, BellandeError> {
+        if prediction.shape != target.shape {
+            return Err(BellandeError::DimensionMismatch);
+        }
+
+        let mut grad = Vec::with_capacity(prediction.data.len());
+        for (pred, tgt) in prediction.data.iter().zip(target.data.iter()) {
+            // Derivative of (pred - tgt)^2 is 2(pred - tgt)
+            grad.push(2.0 * (pred - tgt));
+        }
+
+        let grad = match self.reduction {
+            Reduction::None => grad,
+            Reduction::Mean => {
+                // Scale gradients by 1/N for mean reduction
+                let scale = 1.0 / prediction.data.len() as f32;
+                grad.iter().map(|&g| g * scale).collect()
+            }
+            Reduction::Sum => grad,
+        };
+
+        Ok(Tensor::new(
+            grad,
+            prediction.shape.clone(),
+            true,
+            prediction.device.clone(),
+            prediction.dtype,
+        ))
+    }
+}
+
+// Implement Send and Sync for thread safety
+unsafe impl Send for MSELoss {}
+unsafe impl Sync for MSELoss {}
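Editor's note: the backward pass uses d/dp (p - t)² = 2(p - t); under Mean reduction each gradient is additionally scaled by 1/N, while Sum leaves it unscaled. A sketch with illustrative shapes:

    let mse = MSELoss::new(Reduction::Sum);
    let pred = Tensor::randn(&[3, 1]);
    let target = Tensor::zeros(&[3, 1]);
    let loss = mse.forward(&pred, &target)?;   // sum of squared errors
    let grad = mse.backward(&pred, &target)?;  // 2 * (pred - target)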
diff --git a/src/metrics/metrics.rs b/src/metrics/metrics.rs
new file mode 100644
index 0000000..a74856c
--- /dev/null
+++ b/src/metrics/metrics.rs
@@ -0,0 +1,74 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::tensor::Tensor;
+
+pub trait Metric {
+    fn reset(&mut self);
+    fn update(&mut self, prediction: &Tensor, target: &Tensor);
+    fn compute(&self) -> f32;
+    fn name(&self) -> &str;
+}
+
+pub struct Accuracy {
+    correct: usize,
+    total: usize,
+}
+
+impl Accuracy {
+    pub fn new() -> Self {
+        Accuracy {
+            correct: 0,
+            total: 0,
+        }
+    }
+}
+
+impl Metric for Accuracy {
+    fn reset(&mut self) {
+        self.correct = 0;
+        self.total = 0;
+    }
+
+    fn update(&mut self, prediction: &Tensor, target: &Tensor) {
+        let pred_classes: Vec<usize> = prediction
+            .data
+            .chunks(prediction.shape[1])
+            .map(|chunk| {
+                chunk
+                    .iter()
+                    .enumerate()
+                    .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
+                    .unwrap()
+                    .0
+            })
+            .collect();
+
+        for (pred, &true_class) in pred_classes.iter().zip(target.data.iter()) {
+            if *pred == true_class as usize {
+                self.correct += 1;
+            }
+            self.total += 1;
+        }
+    }
+
+    fn compute(&self) -> f32 {
+        self.correct as f32 / self.total as f32
+    }
+
+    fn name(&self) -> &str {
+        "accuracy"
+    }
+}
diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs
new file mode 100644
index 0000000..e144883
--- /dev/null
+++ b/src/metrics/mod.rs
@@ -0,0 +1 @@
+pub mod metrics;
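Editor's note: Accuracy takes the argmax over the class axis of each prediction row and compares it with the float-encoded label; a sketch of the accumulate-then-compute cycle (illustrative data):

    let mut acc = Accuracy::new();
    let logits = Tensor::randn(&[4, 3]);  // (batch, num_classes)
    let labels = Tensor::zeros(&[4]);     // true class indices as floats
    acc.update(&logits, &labels);
    let score = acc.compute();            // correct / total
    acc.reset();                          // start a fresh epoch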
diff --git a/src/models/custom.rs b/src/models/custom.rs
new file mode 100644
index 0000000..08eacbc
--- /dev/null
+++ b/src/models/custom.rs
@@ -0,0 +1,248 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::layer::batch_norm::BatchNorm1d;
+use crate::layer::dropout::Dropout;
+use crate::layer::{activation::ReLU, linear::Linear};
+use crate::models::sequential::Sequential;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::fs::{create_dir_all, File};
+use std::io::{Read, Write};
+use std::path::Path;
+
+pub trait ModelBuilder {
+    fn build(&self, config: &ModelConfig) -> Result<Box<dyn Model>, BellandeError>;
+}
+
+#[derive(Clone, Serialize, Deserialize)]
+pub struct ModelConfig {
+    pub input_shape: Vec<usize>,
+    pub num_classes: usize,
+    pub hyperparameters: HashMap<String, f32>,
+}
+
+pub trait Model {
+    fn forward(&mut self, x: &Tensor) -> Result<Tensor, BellandeError>;
+    fn backward(&mut self, grad: &Tensor) -> Result<Tensor, BellandeError>;
+    fn parameters(&self) -> Vec<Tensor>;
+    fn train(&mut self);
+    fn eval(&mut self);
+    fn save(&self, path: &str) -> Result<(), BellandeError>;
+    fn load(&mut self, path: &str) -> Result<(), BellandeError>;
+}
+
+// Remove derive for Sequential
+pub struct CustomModel {
+    layers: Sequential,
+    config: ModelConfig,
+    training: bool,
+}
+
+#[derive(Serialize, Deserialize)]
+struct ModelState {
+    config: ModelConfig,
+    parameters: Vec<Vec<f32>>,
+    parameter_shapes: Vec<Vec<usize>>,
+}
+
+impl CustomModel {
+    pub fn new(config: ModelConfig) -> Self {
+        let mut layers = Sequential::new();
+        let input_size = config.input_shape.iter().product();
+        let hidden_size = *config.hyperparameters.get("hidden_size").unwrap_or(&128.0) as usize;
+
+        // Build the model architecture based on config
+        layers.add(Box::new(Linear::new(input_size, hidden_size, true)));
+        layers.add(Box::new(ReLU::new()));
+
+        if let Some(dropout_rate) = config.hyperparameters.get("dropout_rate") {
+            // Handle the Result from Dropout::new
+            if let Ok(dropout) = Dropout::new(*dropout_rate) {
+                layers.add(Box::new(dropout));
+            }
+        }
+
+        // Add batch normalization if specified
+        if config.hyperparameters.get("use_batch_norm").unwrap_or(&0.0) > &0.0 {
+            layers.add(Box::new(BatchNorm1d::new(hidden_size, 1e-5, 0.1, true)));
+        }
+
+        // Add additional layers based on depth parameter
+        let depth = *config.hyperparameters.get("depth").unwrap_or(&1.0) as usize;
+        for _ in 0..depth {
+            layers.add(Box::new(Linear::new(hidden_size, hidden_size, true)));
+            layers.add(Box::new(ReLU::new()));
+
+            if let Some(dropout_rate) = config.hyperparameters.get("dropout_rate") {
+                // Handle the Result from Dropout::new
+                if let Ok(dropout) = Dropout::new(*dropout_rate) {
+                    layers.add(Box::new(dropout));
+                }
+            }
+        }
+
+        // Output layer
+        layers.add(Box::new(Linear::new(hidden_size, config.num_classes, true)));
+
+        CustomModel {
+            layers,
+            config,
+            training: true,
+        }
+    }
+
+    pub fn get_config(&self) -> &ModelConfig {
+        &self.config
+    }
+
+    pub fn set_learning_rate(&mut self, lr: f32) {
+        self.config
+            .hyperparameters
+            .insert("learning_rate".to_string(), lr);
+    }
+
+    fn create_checkpoint_dir(&self, path: &str) -> Result<(), BellandeError> {
+        if let Some(parent) = Path::new(path).parent() {
+            create_dir_all(parent).map_err(|e| {
+                BellandeError::IOError(format!("Failed to create directory: {}", e).into())
+            })?;
+        }
+        Ok(())
+    }
+}
+
+impl Model for CustomModel {
+    fn forward(&mut self, x: &Tensor) -> Result<Tensor, BellandeError> {
+        if x.shape[1..] != self.config.input_shape[..] {
+            return Err(BellandeError::InvalidShape(format!(
+                "Expected input shape {:?}, got {:?}",
+                self.config.input_shape,
+                x.shape[1..].to_vec()
+            )));
+        }
+        self.layers.forward(x)
+    }
+
+    fn backward(&mut self, grad: &Tensor) -> Result<Tensor, BellandeError> {
+        if !self.training {
+            return Err(BellandeError::InvalidOperation(
+                "Backward pass called while model is in evaluation mode".into(),
+            ));
+        }
+        self.layers.backward(grad)
+    }
+
+    fn parameters(&self) -> Vec<Tensor> {
+        self.layers.parameters()
+    }
+
+    fn train(&mut self) {
+        self.training = true;
+        self.layers.train();
+    }
+
+    fn eval(&mut self) {
+        self.training = false;
+        self.layers.eval();
+    }
+
+    fn save(&self, path: &str) -> Result<(), BellandeError> {
+        self.create_checkpoint_dir(path)?;
+
+        let parameters: Vec<Vec<f32>> = self
+            .parameters()
+            .iter()
+            .map(|tensor| tensor.data.clone())
+            .collect();
+
+        let parameter_shapes: Vec<Vec<usize>> = self
+            .parameters()
+            .iter()
+            .map(|tensor| tensor.shape.clone())
+            .collect();
+
+        let model_state = ModelState {
+            config: self.config.clone(),
+            parameters,
+            parameter_shapes,
+        };
+
+        let serialized = serde_json::to_string(&model_state).map_err(|e| {
+            BellandeError::SerializationError(format!("Failed to serialize model: {}", e))
+        })?;
+
+        let mut file = File::create(path)
+            .map_err(|e| BellandeError::IOError(format!("Failed to create file: {}", e)))?;
+
+        file.write_all(serialized.as_bytes())
+            .map_err(|e| BellandeError::IOError(format!("Failed to write to file: {}", e)))?;
+
+        Ok(())
+    }
+
+    fn load(&mut self, path: &str) -> Result<(), BellandeError> {
+        let mut file = File::open(path)
+            .map_err(|e| BellandeError::IOError(format!("Failed to open file: {}", e)))?;
+
+        let mut contents = String::new();
+        file.read_to_string(&mut contents)
+            .map_err(|e| BellandeError::IOError(format!("Failed to read file: {}", e)))?;
+
+        let model_state: ModelState = serde_json::from_str(&contents).map_err(|e| {
+            BellandeError::SerializationError(format!("Failed to deserialize model: {}", e))
+        })?;
+
+        // Verify configuration compatibility
+        if model_state.config.input_shape != self.config.input_shape {
+            return Err(BellandeError::InvalidConfiguration(
+                "Input shape mismatch".into(),
+            ));
+        }
+
+        if model_state.config.num_classes != self.config.num_classes {
+            return Err(BellandeError::InvalidConfiguration(
+                "Number of classes mismatch".into(),
+            ));
+        }
+
+        // Load parameters
+        let mut current_parameters = self.parameters();
+        if current_parameters.len() != model_state.parameters.len() {
+            return Err(BellandeError::InvalidConfiguration(
+                "Parameter count mismatch".into(),
+            ));
+        }
+
+        for (((param, saved_data), saved_shape), current_param) in current_parameters
+            .iter_mut()
+            .zip(model_state.parameters.iter())
+            .zip(model_state.parameter_shapes.iter())
+            .zip(self.parameters())
+        {
+            if saved_shape != &current_param.shape {
+                return Err(BellandeError::InvalidConfiguration(format!(
+                    "Parameter shape mismatch: expected {:?}, got {:?}",
+                    current_param.shape, saved_shape
+                )));
+            }
+            param.data = saved_data.clone();
+        }
+
+        self.config = model_state.config;
+        Ok(())
+    }
+}
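Editor's note: a sketch of driving CustomModel through its config-based constructor; the hyperparameter keys ("hidden_size", "dropout_rate", "depth", "use_batch_norm") are the ones read in new() above, and the checkpoint path is hypothetical:

    use std::collections::HashMap;

    let mut hyper = HashMap::new();
    hyper.insert("hidden_size".to_string(), 64.0);
    hyper.insert("dropout_rate".to_string(), 0.2);
    hyper.insert("depth".to_string(), 2.0);

    let config = ModelConfig {
        input_shape: vec![16],
        num_classes: 4,
        hyperparameters: hyper,
    };
    let mut model = CustomModel::new(config);
    let logits = model.forward(&Tensor::randn(&[8, 16]))?;
    model.save("checkpoints/custom_model.json")?;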
diff --git a/src/models/mod.rs b/src/models/mod.rs
new file mode 100644
index 0000000..b1a3ffd
--- /dev/null
+++ b/src/models/mod.rs
@@ -0,0 +1,5 @@
+pub mod custom;
+pub mod models;
+pub mod resnet;
+pub mod sequential;
+pub mod vgg;
diff --git a/src/models/models.rs b/src/models/models.rs
new file mode 100644
index 0000000..d687655
--- /dev/null
+++ b/src/models/models.rs
@@ -0,0 +1,215 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{device::Device, dtype::DataType, error::BellandeError, tensor::Tensor};
+use crate::models::sequential::Sequential;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+/// Base model trait defining common functionality for neural networks
+pub trait Model: Send + Sync {
+    /// Forward pass through the model
+    fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError>;
+
+    /// Backward pass through the model
+    fn backward(&mut self, grad: &Tensor) -> Result<Tensor, BellandeError>;
+
+    /// Get model parameters
+    fn parameters(&self) -> Vec<Tensor>;
+
+    /// Set model to training mode
+    fn train(&mut self);
+
+    /// Set model to evaluation mode
+    fn eval(&mut self);
+
+    /// Save model to file
+    fn save(&self, path: &str) -> Result<(), BellandeError>;
+
+    /// Load model from file
+    fn load(&mut self, path: &str) -> Result<(), BellandeError>;
+
+    /// Get model state dictionary
+    fn state_dict(&self) -> HashMap<String, Tensor>;
+
+    /// Load model state dictionary
+    fn load_state_dict(&mut self, state_dict: HashMap<String, Tensor>)
+        -> Result<(), BellandeError>;
+}
+
+/// Model state for serialization
+#[derive(Serialize, Deserialize)]
+pub struct ModelState {
+    pub model_type: String,
+    pub state_dict: HashMap<String, Vec<f32>>,
+    pub shapes: HashMap<String, Vec<usize>>,
+    pub config: ModelConfig,
+}
+
+/// Model configuration
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct ModelConfig {
+    pub input_shape: Vec<usize>,
+    pub num_classes: usize,
+    pub dropout_rate: f32,
+    pub hidden_layers: Vec<usize>,
+}
+
+impl Model for Sequential {
+    fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        if self.layers.is_empty() {
+            return Err(BellandeError::InvalidInputs("Model has no layers".into()));
+        }
+
+        let mut current = input.clone();
+        for layer in &mut self.layers {
+            current = layer
+                .forward(&current)
+                .map_err(|e| BellandeError::RuntimeError(format!("Forward pass failed: {}", e)))?;
+        }
+        Ok(current)
+    }
+
+    fn backward(&mut self, grad: &Tensor) -> Result<Tensor, BellandeError> {
+        if self.layers.is_empty() {
+            return Err(BellandeError::InvalidInputs("Model has no layers".into()));
+        }
+
+        if !self.training {
+            return Err(BellandeError::InvalidBackward(
+                "Model not in training mode".into(),
+            ));
+        }
+
+        let mut current_grad = grad.clone();
+        for layer in self.layers.iter_mut().rev() {
+            current_grad = layer
+                .backward(&current_grad)
+                .map_err(|e| BellandeError::RuntimeError(format!("Backward pass failed: {}", e)))?;
+        }
+        Ok(current_grad)
+    }
+
+    fn parameters(&self) -> Vec<Tensor> {
+        self.layers
+            .iter()
+            .flat_map(|layer| layer.parameters())
+            .collect()
+    }
+
+    fn train(&mut self) {
+        self.training = true;
+        for layer in &mut self.layers {
+            layer.train();
+        }
+    }
+
+    fn eval(&mut self) {
+        self.training = false;
+        for layer in &mut self.layers {
+            layer.eval();
+        }
+    }
+
+    fn save(&self, path: &str) -> Result<(), BellandeError> {
+        let state = ModelState {
+            model_type: "Sequential".to_string(),
+            state_dict: self
+                .state_dict()
.into_iter() + .map(|(k, v)| (k, v.data)) + .collect(), + shapes: self + .state_dict() + .into_iter() + .map(|(k, v)| (k, v.shape)) + .collect(), + config: ModelConfig { + input_shape: vec![], + num_classes: 0, + dropout_rate: 0.0, + hidden_layers: vec![], + }, + }; + + let file = std::fs::File::create(path) + .map_err(|e| BellandeError::IOError(format!("Error: {}", e)))?; + serde_json::to_writer(file, &state).map_err(|e| { + BellandeError::SerializationError(format!("Failed to serialize model state: {}", e)) + }) + } + + fn load(&mut self, path: &str) -> Result<(), BellandeError> { + let file = std::fs::File::open(path) + .map_err(|e| BellandeError::IOError(format!("Error: {}", e)))?; + + let state: ModelState = serde_json::from_reader(file).map_err(|e| { + BellandeError::SerializationError(format!("Failed to deserialize model state: {}", e)) + })?; + + let mut state_dict = HashMap::new(); + for (key, data) in state.state_dict { + let shape = state.shapes.get(&key).ok_or_else(|| { + BellandeError::RuntimeError(format!("Missing shape for key: {}", key)) + })?; + + state_dict.insert( + key, + Tensor::new(data, shape.clone(), true, Device::CPU, DataType::Float32), + ); + } + + self.load_state_dict(state_dict) + } + + fn state_dict(&self) -> HashMap { + let mut state_dict = HashMap::new(); + for (i, layer) in self.layers.iter().enumerate() { + for (name, param) in layer.named_parameters() { + state_dict.insert(format!("layer_{}.{}", i, name), param); + } + } + state_dict + } + + fn load_state_dict( + &mut self, + state_dict: HashMap, + ) -> Result<(), BellandeError> { + for (i, layer) in self.layers.iter_mut().enumerate() { + for (name, _) in layer.named_parameters() { + let key = format!("layer_{}.{}", i, name); + if let Some(param) = state_dict.get(&key) { + layer.set_parameter(&name, param.clone()).map_err(|e| { + BellandeError::RuntimeError(format!( + "Failed to set parameter {}: {}", + key, e + )) + })?; + } else { + return Err(BellandeError::RuntimeError(format!( + "Missing parameter: {}", + key + ))); + } + } + } + Ok(()) + } +} + +// Thread safety implementations +unsafe impl Send for Sequential {} +unsafe impl Sync for Sequential {} diff --git a/src/models/resnet.rs b/src/models/resnet.rs new file mode 100644 index 0000000..50c7344 --- /dev/null +++ b/src/models/resnet.rs @@ -0,0 +1,190 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
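A round-trip sketch for the Sequential Model implementation above (illustrative; it assumes Linear implements NeuralLayer with matching named_parameters/set_parameter keys, and note that save() currently writes a placeholder ModelConfig):

    use bellande_artificial_intelligence_training_framework::core::error::BellandeError;
    use bellande_artificial_intelligence_training_framework::layer::linear::Linear;
    use bellande_artificial_intelligence_training_framework::models::models::Model;
    use bellande_artificial_intelligence_training_framework::models::sequential::Sequential;

    fn roundtrip() -> Result<(), BellandeError> {
        let mut model = Sequential::new();
        model.add(Box::new(Linear::new(4, 2, true)));
        // save() serializes state_dict() as flat parameter data plus shapes.
        model.save("model.json")?;

        // A fresh model with the same architecture restores parameters by key
        // ("layer_0.<name>"), so the layer layout must match exactly.
        let mut restored = Sequential::new();
        restored.add(Box::new(Linear::new(4, 2, true)));
        restored.load("model.json")
    }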
+ +use crate::core::{error::BellandeError, tensor::Tensor}; +use crate::layer::{ + activation::{Activation, ReLU}, + avgpool2d::AvgPool2d, + batch_norm::BatchNorm2d, + conv::Conv2d, + linear::Linear, + pooling::MaxPool2d, +}; +use crate::models::sequential::{NeuralLayer, Sequential}; + +pub struct ResidualBlock { + conv1: Conv2d, + bn1: BatchNorm2d, + conv2: Conv2d, + bn2: BatchNorm2d, + downsample: Option, + relu: ReLU, +} + +impl ResidualBlock { + pub fn new( + in_channels: usize, + out_channels: usize, + stride: usize, + downsample: Option, + ) -> Self { + ResidualBlock { + conv1: Conv2d::new( + in_channels, + out_channels, + (3, 3), + Some((stride, stride)), + Some((1, 1)), + true, + ), + bn1: BatchNorm2d::new(out_channels, 1e-5, 0.1, true), + conv2: Conv2d::new( + out_channels, + out_channels, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ), + bn2: BatchNorm2d::new(out_channels, 1e-5, 0.1, true), + downsample, + relu: ReLU::new(), + } + } + + pub fn forward(&mut self, x: &Tensor) -> Result { + let identity = if let Some(ref mut ds) = self.downsample { + ds.forward(x)? + } else { + x.clone() + }; + + let mut out = NeuralLayer::forward(&mut self.conv1, x)?; + out = NeuralLayer::forward(&mut self.bn1, &out)?; + out = Activation::forward(&self.relu, &out)?; + + out = NeuralLayer::forward(&mut self.conv2, &out)?; + out = NeuralLayer::forward(&mut self.bn2, &out)?; + + // Use element-wise addition + out = out.add(&identity)?; + out = Activation::forward(&self.relu, &out)?; + + Ok(out) + } +} + +pub struct ResNet { + conv1: Conv2d, + bn1: BatchNorm2d, + relu: ReLU, + maxpool: MaxPool2d, + layer1: Vec, + layer2: Vec, + layer3: Vec, + layer4: Vec, + avgpool: AvgPool2d, + fc: Linear, +} + +impl ResNet { + pub fn resnet18(num_classes: usize) -> Self { + ResNet { + conv1: Conv2d::new(3, 64, (7, 7), Some((2, 2)), Some((3, 3)), true), + bn1: BatchNorm2d::new(64, 1e-5, 0.1, true), + relu: ReLU::new(), + maxpool: MaxPool2d::new((3, 3), Some((2, 2))), + layer1: make_layer(64, 64, 2, 1), + layer2: make_layer(64, 128, 2, 2), + layer3: make_layer(128, 256, 2, 2), + layer4: make_layer(256, 512, 2, 2), + avgpool: AvgPool2d::new((7, 7), Some((1, 1)), None), + fc: Linear::new(512, num_classes, true), + } + } + + pub fn forward(&mut self, x: &Tensor) -> Result { + let mut out = NeuralLayer::forward(&mut self.conv1, x)?; + out = NeuralLayer::forward(&mut self.bn1, &out)?; + out = Activation::forward(&self.relu, &out)?; + out = NeuralLayer::forward(&mut self.maxpool, &out)?; + + for block in &mut self.layer1 { + out = block.forward(&out)?; + } + for block in &mut self.layer2 { + out = block.forward(&out)?; + } + for block in &mut self.layer3 { + out = block.forward(&out)?; + } + for block in &mut self.layer4 { + out = block.forward(&out)?; + } + + out = NeuralLayer::forward(&mut self.avgpool, &out)?; + + // Calculate flattened size for reshape + let batch_size = out.shape[0]; + let total_features = out.data.len() / batch_size; + out = out.reshape(&[batch_size, total_features])?; + + out = NeuralLayer::forward(&mut self.fc, &out)?; + + Ok(out) + } +} + +fn make_layer( + in_channels: usize, + out_channels: usize, + blocks: usize, + stride: usize, +) -> Vec { + let mut layers = Vec::new(); + + let downsample = if stride != 1 || in_channels != out_channels { + let mut sequential = Sequential::new(); + sequential.add(Box::new(Conv2d::new( + in_channels, + out_channels, + (1, 1), + Some((stride, stride)), + Some((0, 0)), + true, + ))); + sequential.add(Box::new(BatchNorm2d::new(out_channels, 1e-5, 0.1, true))); 
+ Some(sequential) + } else { + None + }; + + layers.push(ResidualBlock::new( + in_channels, + out_channels, + stride, + downsample, + )); + + for _ in 1..blocks { + layers.push(ResidualBlock::new(out_channels, out_channels, 1, None)); + } + + layers +} + +// Implement Send and Sync for thread safety +unsafe impl Send for ResNet {} +unsafe impl Sync for ResNet {} diff --git a/src/models/sequential.rs b/src/models/sequential.rs new file mode 100644 index 0000000..50bb02c --- /dev/null +++ b/src/models/sequential.rs @@ -0,0 +1,137 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +use crate::core::{error::BellandeError, tensor::Tensor}; + +/// Trait defining a neural network layer +pub trait NeuralLayer: Send + Sync { + /// Forward pass + fn forward(&mut self, input: &Tensor) -> Result; + + /// Backward pass + fn backward(&mut self, grad: &Tensor) -> Result; + + /// Get layer parameters + fn parameters(&self) -> Vec; + + /// Get named parameters + fn named_parameters(&self) -> Vec<(String, Tensor)>; + + /// Set parameter value + fn set_parameter(&mut self, name: &str, value: Tensor) -> Result<(), BellandeError>; + + /// Set layer to training mode + fn train(&mut self); + + /// Set layer to evaluation mode + fn eval(&mut self); +} + +/// Sequential container for neural network layers +pub struct Sequential { + pub(crate) layers: Vec>, + pub(crate) training: bool, +} + +impl Sequential { + /// Creates a new empty Sequential container + pub fn new() -> Self { + Sequential { + layers: Vec::new(), + training: true, + } + } + + /// Adds a layer to the container and returns mutable reference for chaining + pub fn add(&mut self, layer: Box) -> &mut Self { + self.layers.push(layer); + self + } + + /// Forward pass through all layers + pub fn forward(&mut self, input: &Tensor) -> Result { + let mut current = input.clone(); + for layer in &mut self.layers { + current = layer.forward(¤t)?; + } + Ok(current) + } + + /// Backward pass through all layers in reverse order + pub fn backward(&mut self, grad: &Tensor) -> Result { + if !self.training { + return Err(BellandeError::InvalidBackward( + "Forward pass not called before backward".into(), + ))?; + } + + let mut current_grad = grad.clone(); + for layer in self.layers.iter_mut().rev() { + current_grad = layer.backward(¤t_grad)?; + } + Ok(current_grad) + } + + /// Get all parameters from all layers + pub fn parameters(&self) -> Vec { + self.layers + .iter() + .flat_map(|layer| layer.parameters()) + .collect() + } + + /// Get number of layers + pub fn len(&self) -> usize { + self.layers.len() + } + + /// Check if container is empty + pub fn is_empty(&self) -> bool { + self.layers.is_empty() + } + + /// Get layer at index + pub fn get_layer(&self, index: usize) -> Option<&Box> { + self.layers.get(index) + } + + /// Get mutable layer at index + pub fn get_layer_mut(&mut 
self, index: usize) -> Option<&mut Box> { + self.layers.get_mut(index) + } + + /// Set model to training mode + pub fn train(&mut self) { + self.training = true; + for layer in &mut self.layers { + layer.train(); + } + } + + /// Set model to evaluation mode + pub fn eval(&mut self) { + self.training = false; + for layer in &mut self.layers { + layer.eval(); + } + } +} + +// Implement Default for Sequential +impl Default for Sequential { + fn default() -> Self { + Self::new() + } +} diff --git a/src/models/vgg.rs b/src/models/vgg.rs new file mode 100644 index 0000000..bf9ce3f --- /dev/null +++ b/src/models/vgg.rs @@ -0,0 +1,239 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + +use crate::core::{error::BellandeError, tensor::Tensor}; +use crate::layer::{ + activation::ReLU, avgpool2d::AvgPool2d, conv::Conv2d, dropout::Dropout, linear::Linear, + pooling::MaxPool2d, +}; +use crate::models::sequential::{NeuralLayer, Sequential}; + +pub struct VGG { + features: Sequential, + avgpool: AvgPool2d, + classifier: Sequential, +} + +impl VGG { + pub fn vgg16(num_classes: usize) -> Result { + // Changed to return Result + let mut features = Sequential::new(); + + // Block 1 + features.add(Box::new(Conv2d::new( + 3, + 64, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ))); + features.add(Box::new(ReLU::new())); + features.add(Box::new(Conv2d::new( + 64, + 64, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ))); + features.add(Box::new(ReLU::new())); + features.add(Box::new(MaxPool2d::new((2, 2), Some((2, 2))))); + + // Block 2 + features.add(Box::new(Conv2d::new( + 64, + 128, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ))); + features.add(Box::new(ReLU::new())); + features.add(Box::new(Conv2d::new( + 128, + 128, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ))); + features.add(Box::new(ReLU::new())); + features.add(Box::new(MaxPool2d::new((2, 2), Some((2, 2))))); + + // Block 3 + features.add(Box::new(Conv2d::new( + 128, + 256, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ))); + features.add(Box::new(ReLU::new())); + features.add(Box::new(Conv2d::new( + 256, + 256, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ))); + features.add(Box::new(ReLU::new())); + features.add(Box::new(Conv2d::new( + 256, + 256, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ))); + features.add(Box::new(ReLU::new())); + features.add(Box::new(MaxPool2d::new((2, 2), Some((2, 2))))); + + // Block 4 + features.add(Box::new(Conv2d::new( + 256, + 512, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ))); + features.add(Box::new(ReLU::new())); + features.add(Box::new(Conv2d::new( + 512, + 512, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ))); + features.add(Box::new(ReLU::new())); + features.add(Box::new(Conv2d::new( + 512, + 512, + (3, 3), + Some((1, 1)), + Some((1, 1)), + true, + ))); + 
+        features.add(Box::new(ReLU::new()));
+        features.add(Box::new(MaxPool2d::new((2, 2), Some((2, 2)))));
+
+        // Block 5
+        features.add(Box::new(Conv2d::new(
+            512,
+            512,
+            (3, 3),
+            Some((1, 1)),
+            Some((1, 1)),
+            true,
+        )));
+        features.add(Box::new(ReLU::new()));
+        features.add(Box::new(Conv2d::new(
+            512,
+            512,
+            (3, 3),
+            Some((1, 1)),
+            Some((1, 1)),
+            true,
+        )));
+        features.add(Box::new(ReLU::new()));
+        features.add(Box::new(Conv2d::new(
+            512,
+            512,
+            (3, 3),
+            Some((1, 1)),
+            Some((1, 1)),
+            true,
+        )));
+        features.add(Box::new(ReLU::new()));
+        features.add(Box::new(MaxPool2d::new((2, 2), Some((2, 2)))));
+
+        // Classifier
+        let mut classifier = Sequential::new();
+        classifier.add(Box::new(Linear::new(512 * 7 * 7, 4096, true)));
+        classifier.add(Box::new(ReLU::new()));
+        classifier.add(Box::new(Dropout::new(0.5)?)); // Handle Result
+        classifier.add(Box::new(Linear::new(4096, 4096, true)));
+        classifier.add(Box::new(ReLU::new()));
+        classifier.add(Box::new(Dropout::new(0.5)?)); // Handle Result
+        classifier.add(Box::new(Linear::new(4096, num_classes, true)));
+
+        Ok(VGG {
+            // Wrap in Ok
+            features,
+            avgpool: AvgPool2d::new(
+                (7, 7),       // kernel_size
+                Some((1, 1)), // stride
+                None,         // padding
+            ),
+            classifier,
+        })
+    }
+
+    pub fn forward(&mut self, x: &Tensor) -> Result<Tensor, BellandeError> {
+        // Feature extraction
+        let mut out = self.features.forward(x)?;
+
+        // Average pooling
+        out = NeuralLayer::forward(&mut self.avgpool, &out)?; // Use trait method explicitly
+
+        // Flatten the tensor properly
+        let batch_size = out.shape[0];
+        let flattened_size = out.data.len() / batch_size;
+
+        // Use reshape instead of view
+        out = out.reshape(&[batch_size, flattened_size])?;
+
+        // Classification
+        out = self.classifier.forward(&out)?;
+
+        Ok(out)
+    }
+}
+
+// Add NeuralLayer implementation for ReLU
+impl NeuralLayer for ReLU {
+    fn forward(&mut self, input: &Tensor) -> Result<Tensor, BellandeError> {
+        // Delegate to the activation's own implementation; a bare self.forward(input)
+        // here would resolve back to NeuralLayer::forward and recurse forever.
+        crate::layer::activation::Activation::forward(self, input)
+    }
+
+    fn backward(&mut self, grad_output: &Tensor) -> Result<Tensor, BellandeError> {
+        crate::layer::activation::Activation::backward(self, grad_output)
+    }
+
+    fn parameters(&self) -> Vec<Tensor> {
+        Vec::new() // ReLU has no parameters
+    }
+
+    fn named_parameters(&self) -> Vec<(String, Tensor)> {
+        Vec::new() // ReLU has no parameters
+    }
+
+    fn set_parameter(&mut self, _name: &str, _value: Tensor) -> Result<(), BellandeError> {
+        Err(BellandeError::InvalidParameter(
+            "ReLU has no parameters".into(),
+        ))
+    }
+
+    fn train(&mut self) {} // ReLU doesn't have training mode
+    fn eval(&mut self) {} // ReLU doesn't have eval mode
+}
+
+// Implement Send and Sync
+unsafe impl Send for VGG {}
+unsafe impl Sync for VGG {}
diff --git a/src/optim/adam.rs b/src/optim/adam.rs
new file mode 100644
index 0000000..5ffe314
--- /dev/null
+++ b/src/optim/adam.rs
@@ -0,0 +1,175 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande

+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.

+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.

+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
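A smoke-test sketch for the ResNet-18 constructor defined earlier in this commit (illustrative; it assumes the conventional 1x3x224x224 input layout so the final 7x7 average pool collapses to a 512-feature vector before the fully connected layer):

    use bellande_artificial_intelligence_training_framework::core::{
        device::Device, dtype::DataType, error::BellandeError, tensor::Tensor,
    };
    use bellande_artificial_intelligence_training_framework::models::resnet::ResNet;

    fn resnet_smoke_test() -> Result<(), BellandeError> {
        let mut model = ResNet::resnet18(10);
        // One all-zero RGB image; shape is [batch, channels, height, width].
        let x = Tensor::new(
            vec![0.0; 3 * 224 * 224],
            vec![1, 3, 224, 224],
            false,
            Device::CPU,
            DataType::Float32,
        );
        let logits = model.forward(&x)?;
        assert_eq!(logits.shape, vec![1, 10]);
        Ok(())
    }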
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::optim::{Optimizer, OptimizerState, ParameterGroup};
+use std::collections::HashMap;
+
+pub struct Adam {
+    params: Vec<Tensor>,
+    lr: f32,
+    betas: (f32, f32),
+    eps: f32,
+    weight_decay: f32,
+    m: HashMap<usize, Vec<f32>>,
+    v: HashMap<usize, Vec<f32>>,
+    param_groups: Vec<ParameterGroup>,
+    state: OptimizerState,
+}
+
+impl Adam {
+    pub fn new(
+        params: Vec<Tensor>,
+        lr: f32,
+        betas: (f32, f32),
+        eps: f32,
+        weight_decay: f32,
+    ) -> Self {
+        let mut m = HashMap::new();
+        let mut v = HashMap::new();
+        for (idx, param) in params.iter().enumerate() {
+            m.insert(idx, vec![0.0; param.data.len()]);
+            v.insert(idx, vec![0.0; param.data.len()]);
+        }
+
+        // Create default parameter group
+        let default_group = ParameterGroup::new(params.clone())
+            .with_lr(lr)
+            .with_weight_decay(weight_decay)
+            .with_betas(betas.0, betas.1)
+            .with_eps(eps);
+
+        Adam {
+            params,
+            lr,
+            betas,
+            eps,
+            weight_decay,
+            m,
+            v,
+            param_groups: vec![default_group],
+            state: OptimizerState::new(),
+        }
+    }
+}
+
+// Implement the Optimizer trait for Adam
+impl Optimizer for Adam {
+    fn step(&mut self) -> Result<(), BellandeError> {
+        self.state.increment_step();
+        let bias_correction1 = 1.0 - self.betas.0.powi(self.state.step as i32);
+        let bias_correction2 = 1.0 - self.betas.1.powi(self.state.step as i32);
+
+        for (idx, param) in self.params.iter_mut().enumerate() {
+            if let Some(grad) = &param.grad {
+                let m = self.m.get_mut(&idx).unwrap();
+                let v = self.v.get_mut(&idx).unwrap();
+
+                // Apply updates with proper vectorization
+                for ((p, g), (m_i, v_i)) in param
+                    .data
+                    .iter_mut()
+                    .zip(grad.iter())
+                    .zip(m.iter_mut().zip(v.iter_mut()))
+                {
+                    let mut d_p = *g;
+                    if self.weight_decay != 0.0 {
+                        d_p += self.weight_decay * *p;
+                    }
+
+                    // Update biased first moment estimate
+                    *m_i = self.betas.0 * *m_i + (1.0 - self.betas.0) * d_p;
+                    // Update biased second raw moment estimate
+                    *v_i = self.betas.1 * *v_i + (1.0 - self.betas.1) * d_p * d_p;
+
+                    // Compute bias-corrected estimates
+                    let m_hat = *m_i / bias_correction1;
+                    let v_hat = *v_i / bias_correction2;
+
+                    // Update parameters
+                    *p -= self.lr * m_hat / (v_hat.sqrt() + self.eps);
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn zero_grad(&mut self) {
+        for param in &mut self.params {
+            if let Some(grad) = &mut param.grad {
+                grad.iter_mut().for_each(|g| *g = 0.0);
+            }
+        }
+    }
+
+    fn get_lr(&self) -> f32 {
+        self.lr
+    }
+
+    fn set_lr(&mut self, lr: f32) {
+        self.lr = lr;
+        for group in &mut self.param_groups {
+            group.lr = lr;
+        }
+    }
+
+    fn parameters(&self) -> &Vec<Tensor> {
+        &self.params
+    }
+
+    fn parameters_mut(&mut self) -> &mut Vec<Tensor> {
+        &mut self.params
+    }
+
+    fn name(&self) -> &str {
+        "Adam"
+    }
+
+    fn get_param_groups(&self) -> &[ParameterGroup] {
+        &self.param_groups
+    }
+
+    fn get_param_groups_mut(&mut self) -> &mut [ParameterGroup] {
+        &mut self.param_groups
+    }
+
+    fn add_param_group(&mut self, group: ParameterGroup) {
+        let start_idx = self.params.len();
+
+        // Initialize momentum and velocity for new parameters
+        for (i, param) in group.params.iter().enumerate() {
+            self.m.insert(start_idx + i, vec![0.0; param.data.len()]);
+            self.v.insert(start_idx + i, vec![0.0; param.data.len()]);
+        }
+
+        // Update params list
+        self.params.extend(group.params.clone());
+
+        // Add the group
+        self.param_groups.push(group);
+    }
+
+    fn state(&self) -> &OptimizerState {
+        &self.state
+    }
+
+    fn state_mut(&mut self) -> &mut OptimizerState {
+        &mut self.state
+    }
+}
+
+// Implement Send and Sync for thread safety
+unsafe impl Send for Adam {}
+unsafe impl Sync for Adam {}
diff --git a/src/optim/mod.rs b/src/optim/mod.rs
new file mode 100644
index 0000000..1acfb84
--- /dev/null
+++ b/src/optim/mod.rs
@@ -0,0 +1,174 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande

+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.

+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.

+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use std::collections::HashMap;
+
+pub mod adam;
+pub mod rmsprop;
+pub mod scheduler;
+pub mod sgd;
+
+pub trait Optimizer: Send + Sync {
+    fn step(&mut self) -> Result<(), BellandeError>;
+    fn zero_grad(&mut self);
+    fn parameters(&self) -> &Vec<Tensor>;
+    fn parameters_mut(&mut self) -> &mut Vec<Tensor>;
+    fn get_lr(&self) -> f32;
+    fn set_lr(&mut self, lr: f32);
+    fn name(&self) -> &str {
+        "GenericOptimizer"
+    }
+    fn get_param_groups(&self) -> &[ParameterGroup];
+    fn get_param_groups_mut(&mut self) -> &mut [ParameterGroup];
+    fn add_param_group(&mut self, group: ParameterGroup);
+    fn state(&self) -> &OptimizerState;
+    fn state_mut(&mut self) -> &mut OptimizerState;
+}
+
+#[derive(Clone)]
+pub struct ParameterGroup {
+    pub params: Vec<Tensor>,
+    pub lr: f32,
+    pub weight_decay: f32,
+    pub momentum: Option<f32>,
+    pub betas: Option<(f32, f32)>,
+    pub eps: f32,
+}
+
+impl ParameterGroup {
+    pub fn new(params: Vec<Tensor>) -> Self {
+        Self {
+            params,
+            lr: 0.001,
+            weight_decay: 0.0,
+            momentum: None,
+            betas: None,
+            eps: 1e-8,
+        }
+    }
+
+    pub fn with_lr(mut self, lr: f32) -> Self {
+        self.lr = lr;
+        self
+    }
+
+    pub fn with_weight_decay(mut self, weight_decay: f32) -> Self {
+        self.weight_decay = weight_decay;
+        self
+    }
+
+    pub fn with_momentum(mut self, momentum: f32) -> Self {
+        self.momentum = Some(momentum);
+        self
+    }
+
+    pub fn with_betas(mut self, beta1: f32, beta2: f32) -> Self {
+        self.betas = Some((beta1, beta2));
+        self
+    }
+
+    pub fn with_eps(mut self, eps: f32) -> Self {
+        self.eps = eps;
+        self
+    }
+}
+
+#[derive(Default)]
+pub struct OptimizerState {
+    pub step: usize,
+    pub state_dict: HashMap<String, Tensor>,
+}
+
+impl OptimizerState {
+    pub fn new() -> Self {
+        Self {
+            step: 0,
+            state_dict: HashMap::new(),
+        }
+    }
+
+    pub fn increment_step(&mut self) {
+        self.step += 1;
+    }
+
+    pub fn get_state(&self, key: &str) -> Option<&Tensor> {
+        self.state_dict.get(key)
+    }
+
+    pub fn set_state(&mut self, key: String, value: Tensor) {
+        self.state_dict.insert(key, value);
+    }
+}
+
+pub trait LearningRateScheduler: Send + Sync {
+    fn step(&mut self, epoch: usize, metrics: &HashMap<String, f32>) -> Result<(), BellandeError>;
+    fn get_last_lr(&self) -> f32;
+    fn name(&self) -> &str {
+        "GenericScheduler"
+    }
+}
+
+pub mod utils {
+    use super::*;
+
+    pub fn apply_weight_decay(param: &mut Tensor, weight_decay: f32) -> Result<(), BellandeError> {
+        if weight_decay != 0.0 {
+            if let Some(ref grad) = param.grad {
+                let mut grad_data = grad.clone();
+                for (g, p) in grad_data.iter_mut().zip(param.data.iter()) {
+                    *g += weight_decay * p;
+                }
+                param.grad = Some(grad_data);
+            }
+        }
+        Ok(())
+    }
+
+    pub fn clip_grad_norm(
+        parameters: &mut [Tensor],
+        max_norm: f32,
+        norm_type: f32,
+    ) -> Result<f32, BellandeError> {
+        let total_norm = compute_grad_norm(parameters, norm_type)?;
+
+        if total_norm > max_norm {
+            let scale = max_norm / (total_norm + 1e-6);
+            for param in parameters.iter_mut() {
+                if let Some(grad) = &mut param.grad {
+                    // Scale the stored gradients in place; scaling a clone would
+                    // leave the real gradients unclipped.
+                    for g in grad.iter_mut() {
+                        *g *= scale;
+                    }
+                }
+            }
+        }
+
+        Ok(total_norm)
+    }
+
+    fn compute_grad_norm(parameters: &[Tensor], norm_type: f32) -> Result<f32, BellandeError> {
+        let mut total_norm = 0.0;
+
+        for param in parameters {
+            if let Some(ref grad) = param.grad {
+                let param_norm: f32 = grad.iter().map(|&x| x.abs().powf(norm_type)).sum::<f32>();
+                total_norm += param_norm;
+            }
+        }
+
+        Ok(total_norm.powf(1.0 / norm_type))
+    }
+}
diff --git a/src/optim/rmsprop.rs b/src/optim/rmsprop.rs
new file mode 100644
index 0000000..4395822
--- /dev/null
+++ b/src/optim/rmsprop.rs
@@ -0,0 +1,206 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande

+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.

+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.

+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::optim::{Optimizer, OptimizerState, ParameterGroup};
+use std::collections::HashMap;
+
+pub struct RMSprop {
+    params: Vec<Tensor>,
+    lr: f32,
+    alpha: f32,
+    eps: f32,
+    weight_decay: f32,
+    momentum: f32,
+    centered: bool,
+    v: HashMap<usize, Vec<f32>>,   // Square average
+    g: HashMap<usize, Vec<f32>>,   // Gradient average (if centered)
+    buf: HashMap<usize, Vec<f32>>, // Momentum buffer
+    param_groups: Vec<ParameterGroup>,
+    state: OptimizerState,
+}
+
+impl RMSprop {
+    pub fn new(
+        params: Vec<Tensor>,
+        lr: f32,
+        alpha: f32,
+        eps: f32,
+        weight_decay: f32,
+        momentum: f32,
+        centered: bool,
+    ) -> Self {
+        let mut v = HashMap::new();
+        let mut g = HashMap::new();
+        let mut buf = HashMap::new();
+
+        for (idx, param) in params.iter().enumerate() {
+            v.insert(idx, vec![0.0; param.data.len()]);
+            if centered {
+                g.insert(idx, vec![0.0; param.data.len()]);
+            }
+            if momentum > 0.0 {
+                buf.insert(idx, vec![0.0; param.data.len()]);
+            }
+        }
+
+        // Create default parameter group
+        let default_group = ParameterGroup::new(params.clone())
+            .with_lr(lr)
+            .with_weight_decay(weight_decay)
+            .with_momentum(momentum)
+            .with_eps(eps);
+
+        RMSprop {
+            params,
+            lr,
+            alpha,
+            eps,
+            weight_decay,
+            momentum,
+            centered,
+            v,
+            g,
+            buf,
+            param_groups: vec![default_group],
+            state: OptimizerState::new(),
+        }
+    }
+}
+
+impl Optimizer for RMSprop {
+    fn step(&mut self) -> Result<(), BellandeError> {
+        self.state.increment_step();
+
+        for (idx, param) in self.params.iter_mut().enumerate() {
+            if let Some(grad) = &param.grad {
+                let v = self.v.get_mut(&idx).unwrap();
+                let mut g = if self.centered {
+                    Some(self.g.get_mut(&idx).unwrap())
+                } else {
+                    None
+                };
+                let mut buf = if self.momentum > 0.0 {
+                    Some(self.buf.get_mut(&idx).unwrap())
+                } else {
+                    None
+                };
+
+                // Process all elements for this parameter
+                for i in 0..param.data.len() {
+                    let grad_val = grad[i];
+                    let mut final_grad = grad_val;
+                    // Apply weight decay if needed
+                    if self.weight_decay != 0.0 {
+                        final_grad += self.weight_decay * param.data[i];
+                    }
+
+                    // Update running average of squared gradients
+                    v[i] = self.alpha * v[i] + (1.0 - self.alpha) * final_grad * final_grad;
+
+                    if let Some(g_avg) = &mut g {
+                        // Update gradient average for centered variant
+                        g_avg[i] = self.alpha * g_avg[i] + (1.0 - self.alpha) * final_grad;
+                        // Centered RMSprop divides by the estimated standard
+                        // deviation: sqrt(E[g^2] - E[g]^2) + eps.
+                        let denom = (v[i] - g_avg[i].powi(2)).sqrt() + self.eps;
+                        final_grad /= denom;
+                    } else {
+                        final_grad /= (v[i] + self.eps).sqrt();
+                    }
+
+                    if let Some(buf_val) = &mut buf {
+                        // Apply momentum if enabled
+                        buf_val[i] = self.momentum * buf_val[i] + final_grad;
+                        param.data[i] -= self.lr * buf_val[i];
+                    } else {
+                        param.data[i] -= self.lr * final_grad;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    fn zero_grad(&mut self) {
+        for param in &mut self.params {
+            if let Some(grad) = &mut param.grad {
+                grad.iter_mut().for_each(|g| *g = 0.0);
+            }
+        }
+    }
+
+    fn get_lr(&self) -> f32 {
+        self.lr
+    }
+
+    fn set_lr(&mut self, lr: f32) {
+        self.lr = lr;
+        for group in &mut self.param_groups {
+            group.lr = lr;
+        }
+    }
+
+    fn parameters(&self) -> &Vec<Tensor> {
+        &self.params
+    }
+
+    fn parameters_mut(&mut self) -> &mut Vec<Tensor> {
+        &mut self.params
+    }
+
+    fn name(&self) -> &str {
+        "RMSprop"
+    }
+
+    fn get_param_groups(&self) -> &[ParameterGroup] {
+        &self.param_groups
+    }
+
+    fn get_param_groups_mut(&mut self) -> &mut [ParameterGroup] {
+        &mut self.param_groups
+    }
+
+    fn add_param_group(&mut self, group: ParameterGroup) {
+        let start_idx = self.params.len();
+
+        // Initialize state for new parameters
+        for (i, param) in group.params.iter().enumerate() {
+            self.v.insert(start_idx + i, vec![0.0; param.data.len()]);
+            if self.centered {
+                self.g.insert(start_idx + i, vec![0.0; param.data.len()]);
+            }
+            if self.momentum > 0.0 {
+                self.buf.insert(start_idx + i, vec![0.0; param.data.len()]);
+            }
+        }
+
+        // Update params list
+        self.params.extend(group.params.clone());
+
+        // Add the group
+        self.param_groups.push(group);
+    }
+
+    fn state(&self) -> &OptimizerState {
+        &self.state
+    }
+
+    fn state_mut(&mut self) -> &mut OptimizerState {
+        &mut self.state
+    }
+}
+
+// Implement Send and Sync for thread safety
+unsafe impl Send for RMSprop {}
+unsafe impl Sync for RMSprop {}
diff --git a/src/optim/scheduler.rs b/src/optim/scheduler.rs
new file mode 100644
index 0000000..9c29d47
--- /dev/null
+++ b/src/optim/scheduler.rs
@@ -0,0 +1,101 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande

+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.

+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.

+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
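A construction sketch for the RMSprop optimizer above (illustrative; `params` would come from a model's parameters(), and gradients are assumed to be populated by a backward pass before step() is called):

    use bellande_artificial_intelligence_training_framework::core::tensor::Tensor;
    use bellande_artificial_intelligence_training_framework::optim::rmsprop::RMSprop;

    fn make_rmsprop(params: Vec<Tensor>) -> RMSprop {
        // Argument order: params, lr, alpha (smoothing), eps, weight_decay,
        // momentum, centered. With centered = true, the update divides by the
        // estimated gradient standard deviation sqrt(E[g^2] - E[g]^2) + eps
        // instead of sqrt(E[g^2] + eps).
        RMSprop::new(params, 0.01, 0.99, 1e-8, 0.0, 0.9, true)
    }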
+
+use crate::core::error::BellandeError;
+
+pub trait LRScheduler {
+    fn step(&mut self);
+    fn get_last_lr(&self) -> f32;
+}
+
+pub struct StepLR {
+    optimizer: Box<dyn Optimizer>,
+    step_size: usize,
+    gamma: f32,
+    base_lr: f32,
+    current_step: usize,
+}
+
+impl StepLR {
+    pub fn new(optimizer: Box<dyn Optimizer>, step_size: usize, gamma: f32) -> Self {
+        let base_lr = optimizer.get_lr();
+        StepLR {
+            optimizer,
+            step_size,
+            gamma,
+            base_lr,
+            current_step: 0,
+        }
+    }
+}
+
+impl LRScheduler for StepLR {
+    fn step(&mut self) {
+        self.current_step += 1;
+        if self.current_step % self.step_size == 0 {
+            let new_lr =
+                self.base_lr * self.gamma.powi((self.current_step / self.step_size) as i32);
+            self.optimizer.set_lr(new_lr);
+        }
+    }
+
+    fn get_last_lr(&self) -> f32 {
+        self.optimizer.get_lr()
+    }
+}
+
+pub struct CosineAnnealingLR {
+    optimizer: Box<dyn Optimizer>,
+    t_max: usize,
+    eta_min: f32,
+    base_lr: f32,
+    current_step: usize,
+}
+
+impl CosineAnnealingLR {
+    pub fn new(optimizer: Box<dyn Optimizer>, t_max: usize, eta_min: f32) -> Self {
+        let base_lr = optimizer.get_lr();
+        CosineAnnealingLR {
+            optimizer,
+            t_max,
+            eta_min,
+            base_lr,
+            current_step: 0,
+        }
+    }
+}
+
+impl LRScheduler for CosineAnnealingLR {
+    fn step(&mut self) {
+        self.current_step += 1;
+        let current_step = self.current_step.min(self.t_max);
+        // Cosine annealing: lr = eta_min + (base_lr - eta_min) * (1 + cos(pi * t / T)) / 2.
+        // The cosine applies to (pi * t / T) alone, not to the whole sum.
+        let new_lr = self.eta_min
+            + (self.base_lr - self.eta_min)
+                * (1.0 + (std::f32::consts::PI * current_step as f32 / self.t_max as f32).cos())
+                / 2.0;
+        self.optimizer.set_lr(new_lr);
+    }
+
+    fn get_last_lr(&self) -> f32 {
+        self.optimizer.get_lr()
+    }
+}
+
+pub trait Optimizer {
+    fn step(&mut self) -> Result<(), BellandeError>;
+    fn zero_grad(&mut self);
+    fn get_lr(&self) -> f32;
+    fn set_lr(&mut self, lr: f32);
+}
diff --git a/src/optim/sgd.rs b/src/optim/sgd.rs
new file mode 100644
index 0000000..410d8aa
--- /dev/null
+++ b/src/optim/sgd.rs
@@ -0,0 +1,174 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande

+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.

+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.

+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
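The cosine annealing curve above can be sanity-checked in isolation (a standalone restatement of the formula, not a call into the scheduler; the scheduler itself increments its step counter before computing):

    fn cosine_lr(base_lr: f32, eta_min: f32, step: usize, t_max: usize) -> f32 {
        let t = step.min(t_max) as f32;
        eta_min
            + (base_lr - eta_min) * (1.0 + (std::f32::consts::PI * t / t_max as f32).cos()) / 2.0
    }

    // cosine_lr(0.1, 0.0, 0, 100)   == 0.1   (starts at base_lr)
    // cosine_lr(0.1, 0.0, 50, 100)  == 0.05  (halfway point)
    // cosine_lr(0.1, 0.0, 100, 100) == 0.0   (fully decayed to eta_min)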
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use crate::optim::{Optimizer, OptimizerState, ParameterGroup};
+use std::collections::HashMap;
+
+pub struct SGD {
+    params: Vec<Tensor>,
+    lr: f32,
+    momentum: f32,
+    weight_decay: f32,
+    nesterov: bool,
+    velocity: HashMap<usize, Vec<f32>>,
+    param_groups: Vec<ParameterGroup>,
+    state: OptimizerState,
+}
+
+impl SGD {
+    pub fn new(
+        params: Vec<Tensor>,
+        lr: f32,
+        momentum: f32,
+        weight_decay: f32,
+        nesterov: bool,
+    ) -> Self {
+        let mut velocity = HashMap::new();
+        if momentum > 0.0 {
+            for (idx, param) in params.iter().enumerate() {
+                velocity.insert(idx, vec![0.0; param.data.len()]);
+            }
+        }
+
+        // Create initial parameter group with correct types
+        let default_group = ParameterGroup::new(params.clone())
+            .with_lr(lr)
+            .with_weight_decay(weight_decay)
+            .with_momentum(momentum)
+            .with_eps(1e-8); // Default epsilon value
+
+        SGD {
+            params,
+            lr,
+            momentum,
+            weight_decay,
+            nesterov,
+            velocity,
+            param_groups: vec![default_group],
+            state: OptimizerState::default(),
+        }
+    }
+
+    pub fn step(&mut self) -> Result<(), BellandeError> {
+        for (idx, param) in self.params.iter_mut().enumerate() {
+            if let Some(grad) = &param.grad {
+                let v_ref = if self.momentum > 0.0 {
+                    self.velocity.get_mut(&idx)
+                } else {
+                    None
+                };
+
+                if let Some(v) = v_ref {
+                    // Case with momentum
+                    for ((p, g), v_i) in param.data.iter_mut().zip(grad.iter()).zip(v.iter_mut()) {
+                        let mut d_p = *g;
+                        if self.weight_decay != 0.0 {
+                            d_p += self.weight_decay * *p;
+                        }
+
+                        *v_i = self.momentum * *v_i + d_p;
+                        if self.nesterov {
+                            d_p += self.momentum * *v_i;
+                        } else {
+                            d_p = *v_i;
+                        }
+                        *p -= self.lr * d_p;
+                    }
+                } else {
+                    // Case without momentum
+                    for (p, g) in param.data.iter_mut().zip(grad.iter()) {
+                        let mut d_p = *g;
+                        if self.weight_decay != 0.0 {
+                            d_p += self.weight_decay * *p;
+                        }
+                        *p -= self.lr * d_p;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
+    pub fn zero_grad(&mut self) {
+        for param in &mut self.params {
+            if let Some(grad) = &mut param.grad {
+                grad.iter_mut().for_each(|g| *g = 0.0);
+            }
+        }
+    }
+}
+
+// Implement the Optimizer trait
+impl Optimizer for SGD {
+    fn step(&mut self) -> Result<(), BellandeError> {
+        self.step()
+    }
+
+    fn zero_grad(&mut self) {
+        self.zero_grad()
+    }
+
+    fn get_lr(&self) -> f32 {
+        self.lr
+    }
+
+    fn set_lr(&mut self, lr: f32) {
+        self.lr = lr;
+    }
+
+    fn parameters(&self) -> &Vec<Tensor> {
+        &self.params
+    }
+
+    fn parameters_mut(&mut self) -> &mut Vec<Tensor> {
+        &mut self.params
+    }
+
+    // Add the missing required methods
+    fn get_param_groups(&self) -> &[ParameterGroup] {
+        &self.param_groups
+    }
+
+    fn get_param_groups_mut(&mut self) -> &mut [ParameterGroup] {
+        &mut self.param_groups
+    }
+
+    fn add_param_group(&mut self, group: ParameterGroup) {
+        let start_idx = self.params.len();
+
+        if self.momentum > 0.0 {
+            for (i, param) in group.params.iter().enumerate() {
+                self.velocity
+                    .insert(start_idx + i, vec![0.0; param.data.len()]);
+            }
+        }
+
+        self.params.extend(group.params.clone());
+        self.param_groups.push(group);
+    }
+
+    fn state(&self) -> &OptimizerState {
+        &self.state
+    }
+
+    fn state_mut(&mut self) -> &mut OptimizerState {
+        &mut self.state
+    }
+}
+
+// Implement Send and Sync
+unsafe impl Send for SGD {}
+unsafe impl Sync for SGD {}
diff --git a/src/training/callbacks.rs b/src/training/callbacks.rs
new file mode 100644
index 0000000..c563729
--- /dev/null
+++ b/src/training/callbacks.rs
@@ -0,0 +1,113 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande

+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.

+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.

+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{error::BellandeError, tensor::Tensor};
+use std::collections::HashMap;
+
+pub trait Callback: Send + Sync {
+    fn on_epoch_begin(
+        &mut self,
+        _epoch: usize,
+        _logs: &HashMap<String, f32>,
+    ) -> Result<(), BellandeError> {
+        Ok(())
+    }
+    fn on_epoch_end(
+        &mut self,
+        _epoch: usize,
+        _logs: &HashMap<String, f32>,
+    ) -> Result<(), BellandeError> {
+        Ok(())
+    }
+    fn on_batch_begin(
+        &mut self,
+        _batch: usize,
+        _logs: &HashMap<String, f32>,
+    ) -> Result<(), BellandeError> {
+        Ok(())
+    }
+    fn on_batch_end(
+        &mut self,
+        _batch: usize,
+        _logs: &HashMap<String, f32>,
+    ) -> Result<(), BellandeError> {
+        Ok(())
+    }
+    fn on_train_begin(&mut self, _logs: &HashMap<String, f32>) -> Result<(), BellandeError> {
+        Ok(())
+    }
+    fn on_train_end(&mut self, _logs: &HashMap<String, f32>) -> Result<(), BellandeError> {
+        Ok(())
+    }
+}
+
+pub struct EarlyStopping {
+    patience: usize,
+    min_delta: f32,
+    monitor: String,
+    best_value: f32,
+    wait: usize,
+    stopped_epoch: usize,
+    restore_best_weights: bool,
+    best_weights: Option<Vec<Tensor>>,
+}
+
+impl EarlyStopping {
+    pub fn new(
+        patience: usize,
+        min_delta: f32,
+        monitor: String,
+        restore_best_weights: bool,
+    ) -> Self {
+        EarlyStopping {
+            patience,
+            min_delta,
+            monitor,
+            best_value: f32::INFINITY,
+            wait: 0,
+            stopped_epoch: 0,
+            restore_best_weights,
+            best_weights: None,
+        }
+    }
+}
+
+impl Callback for EarlyStopping {
+    fn on_epoch_end(
+        &mut self,
+        epoch: usize,
+        logs: &HashMap<String, f32>,
+    ) -> Result<(), BellandeError> {
+        if let Some(&current) = logs.get(&self.monitor) {
+            if current < self.best_value - self.min_delta {
+                self.best_value = current;
+                self.wait = 0;
+                if self.restore_best_weights {
+                    // Save current weights
+                }
+            } else {
+                self.wait += 1;
+                if self.wait >= self.patience {
+                    self.stopped_epoch = epoch;
+                    return Err(BellandeError::EarlyStopping(format!(
+                        "Stopped at epoch {}",
+                        epoch
+                    )));
+                }
+            }
+        }
+        Ok(())
+    }
+}
diff --git a/src/training/checkpoint.rs b/src/training/checkpoint.rs
new file mode 100644
index 0000000..2cfe737
--- /dev/null
+++ b/src/training/checkpoint.rs
@@ -0,0 +1,568 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande

+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.

+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.

+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
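A sketch of driving the EarlyStopping callback above by hand (illustrative; in practice the training loop would invoke the callback hooks):

    use std::collections::HashMap;
    use bellande_artificial_intelligence_training_framework::core::error::BellandeError;
    use bellande_artificial_intelligence_training_framework::training::callbacks::{
        Callback, EarlyStopping,
    };

    fn run() -> Result<(), BellandeError> {
        // Stop if val_loss fails to improve by at least 1e-3 for 3 epochs.
        let mut early = EarlyStopping::new(3, 1e-3, "val_loss".to_string(), false);

        for (epoch, loss) in [0.9_f32, 0.8, 0.81, 0.82, 0.83].iter().enumerate() {
            let mut logs = HashMap::new();
            logs.insert("val_loss".to_string(), *loss);
            // Returns Err(BellandeError::EarlyStopping(..)) once patience runs out,
            // which happens at the last epoch of this sequence.
            early.on_epoch_end(epoch, &logs)?;
        }
        Ok(())
    }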
+ +use crate::core::{device::Device, dtype::DataType}; +use crate::core::{error::BellandeError, tensor::Tensor}; +use crate::models::models::Model; +use crate::training::callbacks::Callback; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::fs::{self, File}; +use std::path::{Path, PathBuf}; + +#[derive(Clone, Copy, Debug, Serialize, Deserialize)] +pub enum CheckpointMode { + Min, + Max, +} + +#[derive(Debug, Clone, Copy)] +pub enum SaveFormat { + Json, + Binary, +} + +pub struct ModelCheckpoint { + filepath: String, + monitor: String, + save_best_only: bool, + save_weights_only: bool, + mode: CheckpointMode, + best_value: f32, + model: Option>, + save_format: SaveFormat, + verbose: bool, + keep_best_n: Option, +} + +#[derive(Serialize, Deserialize)] +struct CheckpointMetadata { + epoch: usize, + best_value: f32, + monitor: String, + mode: CheckpointMode, + metrics: HashMap, +} + +impl ModelCheckpoint { + pub fn new( + filepath: String, + monitor: String, + save_best_only: bool, + save_weights_only: bool, + mode: CheckpointMode, + ) -> Self { + ModelCheckpoint { + filepath, + monitor, + save_best_only, + save_weights_only, + mode, + best_value: match mode { + CheckpointMode::Min => f32::INFINITY, + CheckpointMode::Max => f32::NEG_INFINITY, + }, + model: None, + save_format: SaveFormat::Binary, + verbose: true, + keep_best_n: None, + } + } + + pub fn with_model(mut self, model: Box) -> Self { + self.model = Some(model); + self + } + + pub fn with_save_format(mut self, format: SaveFormat) -> Self { + self.save_format = format; + self + } + + pub fn with_verbose(mut self, verbose: bool) -> Self { + self.verbose = verbose; + self + } + + pub fn with_keep_best_n(mut self, n: usize) -> Self { + self.keep_best_n = Some(n); + self + } + + fn is_better(&self, current: f32) -> bool { + match self.mode { + CheckpointMode::Min => current < self.best_value, + CheckpointMode::Max => current > self.best_value, + } + } + + fn save_checkpoint( + &mut self, + filepath: &Path, + epoch: usize, + metrics: &HashMap, + ) -> Result<(), BellandeError> { + // Create directory first + if let Some(parent) = filepath.parent() { + fs::create_dir_all(parent).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to create directory: {}", e)) + })?; + } + + // Extract values we need before borrowing self + let save_weights_only = self.save_weights_only; + let save_format = self.save_format; + let verbose = self.verbose; + + // Get model reference once + if let Some(model) = self.model.as_ref() { + // Save model/weights without borrowing self again + if save_weights_only { + save_model_weights(model.as_ref(), filepath, save_format)?; + } else { + save_model_state(model.as_ref(), filepath, save_format)?; + } + + let metadata = CheckpointMetadata { + epoch, + best_value: self.best_value, + monitor: self.monitor.clone(), + mode: self.mode, + metrics: metrics.clone(), + }; + + let metadata_path = filepath.with_extension("meta.json"); + let file = File::create(metadata_path).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to create metadata file: {}", e)) + })?; + + serde_json::to_writer_pretty(file, &metadata).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to write metadata: {}", e)) + })?; + + if verbose { + println!("Saved checkpoint to {}", filepath.display()); + } + } + Ok(()) + } + + fn cleanup_old_checkpoints(&mut self, keep_best_n: usize) -> Result<(), BellandeError> { + let meta_pattern = self.filepath.replace("{epoch}", "*").replace("{val}", "*"); + let 
meta_pattern = format!("{}.meta.json", meta_pattern); + + let mut checkpoints: Vec<_> = glob::glob(&meta_pattern) + .map_err(|e| { + BellandeError::RuntimeError(format!("Failed to read checkpoint directory: {}", e)) + })? + .filter_map(Result::ok) + .filter_map(|path| { + if let Ok(file) = File::open(&path) { + if let Ok(metadata) = serde_json::from_reader::<_, CheckpointMetadata>(file) { + return Some((path, metadata)); + } + } + None + }) + .collect(); + + checkpoints.sort_by(|a, b| { + match self.mode { + CheckpointMode::Min => a.1.best_value.partial_cmp(&b.1.best_value), + CheckpointMode::Max => b.1.best_value.partial_cmp(&a.1.best_value), + } + .unwrap() + }); + + for (path, _) in checkpoints.into_iter().skip(keep_best_n) { + let base_path = path.with_extension(""); + if let Err(e) = fs::remove_file(&base_path) { + eprintln!( + "Warning: Failed to remove checkpoint file {}: {}", + base_path.display(), + e + ); + } + if let Err(e) = fs::remove_file(&path) { + eprintln!( + "Warning: Failed to remove metadata file {}: {}", + path.display(), + e + ); + } + } + + Ok(()) + } +} + +fn save_model_weights( + model: &dyn Model, + path: &Path, + save_format: SaveFormat, +) -> Result<(), BellandeError> { + // Get state dict directly (it's not a Result) + let state_dict = model.state_dict(); + let serializable_state: HashMap> = + state_dict.into_iter().map(|(k, v)| (k, v.data)).collect(); + + match save_format { + SaveFormat::Json => { + let file = File::create(path).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to create weights file: {}", e)) + })?; + serde_json::to_writer(file, &serializable_state).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to serialize weights: {}", e)) + })?; + } + SaveFormat::Binary => { + let file = File::create(path).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to create weights file: {}", e)) + })?; + bincode::serialize_into(file, &serializable_state).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to serialize weights: {}", e)) + })?; + } + } + Ok(()) +} + +fn save_model_state( + model: &dyn Model, + path: &Path, + save_format: SaveFormat, +) -> Result<(), BellandeError> { + // Get state dict directly + let state_dict = model.state_dict(); + let serializable_state: HashMap> = + state_dict.into_iter().map(|(k, v)| (k, v.data)).collect(); + + match save_format { + SaveFormat::Json => { + let file = File::create(path).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to create model file: {}", e)) + })?; + serde_json::to_writer(file, &serializable_state).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to serialize model: {}", e)) + })?; + } + SaveFormat::Binary => { + let file = File::create(path).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to create model file: {}", e)) + })?; + bincode::serialize_into(file, &serializable_state).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to serialize model: {}", e)) + })?; + } + } + Ok(()) +} + +fn load_weights_inner( + model: &mut dyn Model, + path: &Path, + save_format: SaveFormat, +) -> Result<(), BellandeError> { + match save_format { + SaveFormat::Json => { + let file = File::open(path).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to open weights file: {}", e)) + })?; + let state_vec: HashMap> = + serde_json::from_reader(file).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to deserialize weights: {}", e)) + })?; + + let state_dict: HashMap = state_vec + .into_iter() + .map(|(k, v)| { + let len = 
v.len(); + ( + k, + Tensor { + shape: vec![len], + data: v, + requires_grad: false, + grad: None, + grad_fn: None, + device: Device::CPU, + dtype: DataType::Float32, + }, + ) + }) + .collect(); + + model.load_state_dict(state_dict)?; + Ok(()) + } + SaveFormat::Binary => { + let file = File::open(path).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to open weights file: {}", e)) + })?; + let state_vec: HashMap> = + bincode::deserialize_from(file).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to deserialize weights: {}", e)) + })?; + + let state_dict: HashMap = state_vec + .into_iter() + .map(|(k, v)| { + let len = v.len(); + ( + k, + Tensor { + shape: vec![len], + data: v, + requires_grad: false, + grad: None, + grad_fn: None, + device: Device::CPU, + dtype: DataType::Float32, + }, + ) + }) + .collect(); + + model.load_state_dict(state_dict)?; + Ok(()) + } + } +} + +fn load_model_inner( + model: &mut dyn Model, + path: &Path, + save_format: SaveFormat, +) -> Result<(), BellandeError> { + match save_format { + SaveFormat::Json => { + let file = File::open(path).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to open model file: {}", e)) + })?; + let state_vec: HashMap> = + serde_json::from_reader(file).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to deserialize model: {}", e)) + })?; + + let state_dict: HashMap = state_vec + .into_iter() + .map(|(k, v)| { + let len = v.len(); + ( + k, + Tensor { + shape: vec![len], + data: v, + requires_grad: false, + grad: None, + grad_fn: None, + device: Device::CPU, + dtype: DataType::Float32, + }, + ) + }) + .collect(); + + model.load_state_dict(state_dict)?; + Ok(()) + } + SaveFormat::Binary => { + let file = File::open(path).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to open model file: {}", e)) + })?; + let state_vec: HashMap> = + bincode::deserialize_from(file).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to deserialize model: {}", e)) + })?; + + let state_dict: HashMap = state_vec + .into_iter() + .map(|(k, v)| { + let len = v.len(); + ( + k, + Tensor { + shape: vec![len], + data: v, + requires_grad: false, + grad: None, + grad_fn: None, + device: Device::CPU, + dtype: DataType::Float32, + }, + ) + }) + .collect(); + + model.load_state_dict(state_dict)?; + Ok(()) + } + } +} + +impl Callback for ModelCheckpoint { + fn on_epoch_end( + &mut self, + epoch: usize, + logs: &HashMap, + ) -> Result<(), BellandeError> { + if let Some(¤t) = logs.get(&self.monitor) { + if !self.save_best_only || self.is_better(current) { + self.best_value = current; + + let filepath = PathBuf::from( + self.filepath + .replace("{epoch}", &epoch.to_string()) + .replace("{val}", &format!("{:.4}", current)), + ); + + self.save_checkpoint(&filepath, epoch, logs)?; + } + } + Ok(()) + } + + fn on_train_begin(&mut self, _logs: &HashMap) -> Result<(), BellandeError> { + if let Some(parent) = Path::new(&self.filepath).parent() { + fs::create_dir_all(parent).map_err(|e| { + BellandeError::RuntimeError(format!("Failed to create checkpoint directory: {}", e)) + })?; + } + + let meta_pattern = self.filepath.replace("{epoch}", "*").replace("{val}", "*"); + let meta_pattern = format!("{}.meta.json", meta_pattern); + + let existing_checkpoints: Vec<_> = glob::glob(&meta_pattern) + .map_err(|e| { + BellandeError::RuntimeError(format!("Failed to read checkpoint directory: {}", e)) + })? 
+            .filter_map(Result::ok)
+            .collect();
+
+        if !existing_checkpoints.is_empty() {
+            let mut best_checkpoint = None;
+            let mut best_value = match self.mode {
+                CheckpointMode::Min => f32::INFINITY,
+                CheckpointMode::Max => f32::NEG_INFINITY,
+            };
+
+            for checkpoint_path in existing_checkpoints {
+                if let Ok(file) = File::open(&checkpoint_path) {
+                    if let Ok(metadata) = serde_json::from_reader::<_, CheckpointMetadata>(file) {
+                        // Track the best checkpoint seen so far, according to the mode
+                        let is_improvement = match self.mode {
+                            CheckpointMode::Min => metadata.best_value < best_value,
+                            CheckpointMode::Max => metadata.best_value > best_value,
+                        };
+                        if is_improvement {
+                            best_value = metadata.best_value;
+                            best_checkpoint = Some((checkpoint_path, metadata));
+                        }
+                    }
+                }
+            }
+
+            if let Some((path, metadata)) = best_checkpoint {
+                self.best_value = metadata.best_value;
+
+                if self.verbose {
+                    println!(
+                        "Resuming from checkpoint: {} (best {} = {})",
+                        path.display(),
+                        self.monitor,
+                        self.best_value
+                    );
+                }
+
+                // Save format and weights_only flag before borrowing model
+                let save_format = self.save_format;
+                let save_weights_only = self.save_weights_only;
+
+                if let Some(model) = self.model.as_mut() {
+                    let model_path = path.with_extension(match save_format {
+                        SaveFormat::Json => "json",
+                        SaveFormat::Binary => "bin",
+                    });
+
+                    if model_path.exists() {
+                        if save_weights_only {
+                            load_weights_inner(model.as_mut(), &model_path, save_format)?;
+                        } else {
+                            load_model_inner(model.as_mut(), &model_path, save_format)?;
+                        }
+                    }
+                }
+            }
+        } else if self.verbose {
+            println!("No existing checkpoints found, starting from scratch");
+        }
+
+        Ok(())
+    }
+
+    fn on_train_end(&mut self, logs: &HashMap<String, f32>) -> Result<(), BellandeError> {
+        if let Some(&final_value) = logs.get(&self.monitor) {
+            let filepath = PathBuf::from(
+                self.filepath
+                    .replace("{epoch}", "final")
+                    .replace("{val}", &format!("{:.4}", final_value)),
+            );
+
+            let metadata = CheckpointMetadata {
+                epoch: usize::MAX,
+                best_value: self.best_value,
+                monitor: self.monitor.clone(),
+                mode: self.mode,
+                metrics: logs.clone(),
+            };
+
+            // Save format and weights_only flag before borrowing model
+            let save_format = self.save_format;
+            let save_weights_only = self.save_weights_only;
+            let verbose = self.verbose;
+
+            if let Some(model) = self.model.as_ref() {
+                if save_weights_only {
+                    save_model_weights(model.as_ref(), &filepath, save_format)?;
+                } else {
+                    save_model_state(model.as_ref(), &filepath, save_format)?;
+                }
+
+                let metadata_path = filepath.with_extension("meta.json");
+                let file = File::create(metadata_path).map_err(|e| {
+                    BellandeError::RuntimeError(format!(
+                        "Failed to create final metadata file: {}",
+                        e
+                    ))
+                })?;
+
+                serde_json::to_writer_pretty(file, &metadata).map_err(|e| {
+                    BellandeError::RuntimeError(format!("Failed to write final metadata: {}", e))
+                })?;
+
+                if verbose {
+                    println!(
+                        "Saved final checkpoint to {} (best {} = {})",
+                        filepath.display(),
+                        self.monitor,
+                        self.best_value
+                    );
+                }
+
+                // Clean up old checkpoints if configured
+                if let Some(keep_best_n) = self.keep_best_n {
+                    self.cleanup_old_checkpoints(keep_best_n)?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/training/history.rs b/src/training/history.rs
new file mode 100644
index 0000000..8fcbe6c
--- /dev/null
+++ b/src/training/history.rs
@@ -0,0 +1,62 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::error::BellandeError;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::fs;
+
+#[derive(Default, Clone, Serialize, Deserialize)]
+pub struct TrainingHistory {
+    pub epochs: Vec<usize>,
+    pub metrics: HashMap<String, Vec<f32>>,
+}
+
+impl TrainingHistory {
+    pub fn new() -> Self {
+        TrainingHistory {
+            epochs: Vec::new(),
+            metrics: HashMap::new(),
+        }
+    }
+
+    pub fn update(&mut self, epoch: usize, metrics: HashMap<String, f32>) {
+        self.epochs.push(epoch);
+        for (key, value) in metrics {
+            self.metrics.entry(key).or_insert_with(Vec::new).push(value);
+        }
+    }
+
+    pub fn get_metric(&self, name: &str) -> Option<&Vec<f32>> {
+        self.metrics.get(name)
+    }
+
+    pub fn save(&self, path: &str) -> Result<(), BellandeError> {
+        let json = serde_json::to_string(self)
+            .map_err(|e| BellandeError::RuntimeError(format!("Serialization failed: {}", e)))?;
+
+        fs::write(path, json).map_err(|e| BellandeError::IOError(format!("Error: {}", e)))?;
+
+        Ok(())
+    }
+
+    pub fn load(path: &str) -> Result<Self, BellandeError> {
+        let json = fs::read_to_string(path)
+            .map_err(|e| BellandeError::IOError(format!("Error: {}", e)))?;
+
+        serde_json::from_str(&json)
+            .map_err(|e| BellandeError::RuntimeError(format!("Deserialization failed: {}", e)))
+    }
+}
diff --git a/src/training/mod.rs b/src/training/mod.rs
new file mode 100644
index 0000000..acd3f96
--- /dev/null
+++ b/src/training/mod.rs
@@ -0,0 +1,5 @@
+pub mod callbacks;
+pub mod checkpoint;
+pub mod history;
+pub mod trainer;
+pub mod validator;
diff --git a/src/training/trainer.rs b/src/training/trainer.rs
new file mode 100644
index 0000000..0040d3b
--- /dev/null
+++ b/src/training/trainer.rs
@@ -0,0 +1,264 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{device::Device, error::BellandeError};
+use crate::data::dataloader::DataLoader;
+use crate::models::models::Model;
+use crate::training::{callbacks::Callback, history::TrainingHistory, validator::CallbackEvent};
+
+use crate::loss::{
+    bce::{BCELoss, Reduction},
+    cross_entropy::CrossEntropyLoss,
+    mse::MSELoss,
+    Loss,
+};
+
+use crate::optim::{adam::Adam, rmsprop::RMSprop, scheduler::LRScheduler, sgd::SGD, Optimizer};
+use std::collections::HashMap;
+
+#[derive(Default)]
+pub struct RunningMetrics {
+    metrics: HashMap<String, (f32, usize)>,
+}
+
+impl RunningMetrics {
+    pub fn new() -> Self {
+        Self {
+            metrics: HashMap::new(),
+        }
+    }
+
+    pub fn update(&mut self, name: &str, value: f32) {
+        let entry = self.metrics.entry(name.to_string()).or_insert((0.0, 0));
+        entry.0 += value;
+        entry.1 += 1;
+    }
+
+    pub fn get_average(&self) -> HashMap<String, f32> {
+        self.metrics
+            .iter()
+            .map(|(k, (sum, count))| (k.clone(), sum / *count as f32))
+            .collect()
+    }
+
+    pub fn get_current(&self) -> HashMap<String, f32> {
+        self.get_average()
+    }
+}
+
+pub struct Trainer {
+    model: Box<dyn Model>,
+    optimizer: Box<dyn Optimizer>,
+    loss_fn: Box<dyn Loss>,
+    device: Device,
+    callbacks: Vec<Box<dyn Callback>>,
+    history: TrainingHistory,
+    scheduler: Option<Box<dyn LRScheduler>>,
+}
+
+impl Trainer {
+    pub fn new(
+        model: Box<dyn Model>,
+        optimizer: Box<dyn Optimizer>,
+        loss_fn: Box<dyn Loss>,
+        device: Device,
+    ) -> Self {
+        Trainer {
+            model,
+            optimizer,
+            loss_fn,
+            device,
+            callbacks: Vec::new(),
+            history: TrainingHistory::new(),
+            scheduler: None,
+        }
+    }
+
+    pub fn new_with_adam(
+        model: Box<dyn Model>,
+        learning_rate: f32,
+        device: Device,
+    ) -> Result<Self, BellandeError> {
+        let loss_fn = Box::new(MSELoss::new(Reduction::Mean));
+        let optimizer = Box::new(Adam::new(
+            model.parameters(),
+            learning_rate,
+            (0.9, 0.999),
+            1e-8,
+            0.0,
+        ));
+
+        Ok(Self::new(model, optimizer, loss_fn, device))
+    }
+
+    pub fn new_with_sgd(
+        model: Box<dyn Model>,
+        learning_rate: f32,
+        momentum: f32,
+        device: Device,
+    ) -> Result<Self, BellandeError> {
+        let loss_fn = Box::new(CrossEntropyLoss::new(Reduction::Mean, None, None));
+        let optimizer = Box::new(SGD::new(
+            model.parameters(),
+            learning_rate,
+            momentum,
+            0.0,
+            false,
+        ));
+
+        Ok(Self::new(model, optimizer, loss_fn, device))
+    }
+
+    pub fn new_with_rmsprop(
+        model: Box<dyn Model>,
+        learning_rate: f32,
+        alpha: f32,
+        device: Device,
+    ) -> Result<Self, BellandeError> {
+        let loss_fn = Box::new(BCELoss::new(Reduction::Mean, None));
+        let optimizer = Box::new(RMSprop::new(
+            model.parameters(),
+            learning_rate,
+            alpha,
+            1e-8,
+            0.0,
+            0.0,
+            false,
+        ));
+
+        Ok(Self::new(model, optimizer, loss_fn, device))
+    }
+
+    pub fn add_scheduler(&mut self, scheduler: Box<dyn LRScheduler>) {
+        self.scheduler = Some(scheduler);
+    }
+
+    pub fn add_callback(&mut self, callback: Box<dyn Callback>) {
+        self.callbacks.push(callback);
+    }
+
+    pub fn fit(
+        &mut self,
+        mut train_loader: DataLoader,
+        mut val_loader: Option<DataLoader>,
+        epochs: usize,
+    ) -> Result<TrainingHistory, BellandeError> {
+        let mut logs = HashMap::new();
+        self.call_callbacks(CallbackEvent::TrainBegin, &logs)?;
+
+        for epoch in 0..epochs {
+            logs.clear();
+            logs.insert("epoch".to_string(), epoch as f32);
+            self.call_callbacks(CallbackEvent::EpochBegin, &logs)?;
+
+            self.model.train();
+            let train_metrics = self.train_epoch(&mut train_loader, epoch)?;
+            logs.extend(train_metrics);
+
+            if let Some(ref mut val_loader) = val_loader {
+                self.model.eval();
+                let val_metrics = self.validate(val_loader)?;
+                logs.extend(
+                    val_metrics
+                        .into_iter()
+                        .map(|(k, v)| (format!("val_{}", k), v)),
+                );
+            }
+
+            if let Some(scheduler) = &mut self.scheduler {
+                scheduler.step();
+            }
+
+            self.history.update(epoch, logs.clone());
+            self.call_callbacks(CallbackEvent::EpochEnd, &logs)?;
+        }
+
+        self.call_callbacks(CallbackEvent::TrainEnd, &logs)?;
+        Ok(self.history.clone())
+    }
+
+    fn train_epoch(
+        &mut self,
+        train_loader: &mut DataLoader,
+        _epoch: usize,
+    ) -> Result<HashMap<String, f32>, BellandeError> {
+        let mut metrics = RunningMetrics::new();
+
+        for batch in train_loader.iter() {
+            let (data, target) = batch?;
+            let batch_logs = HashMap::new();
+            self.call_callbacks(CallbackEvent::BatchBegin, &batch_logs)?;
+
+            let data = data.to_device(&self.device)?;
+            let target = target.to_device(&self.device)?;
+
+            // Forward pass
+            let output = self.model.forward(&data)?;
+            let mut loss = self.loss_fn.forward(&output, &target)?;
+
+            // Backward pass: gradients flow back from the loss tensor
+            self.optimizer.zero_grad();
+            loss.backward()?;
+            self.optimizer.step()?;
+
+            metrics.update("loss", loss.data[0]);
+
+            let batch_logs = metrics.get_current();
+            self.call_callbacks(CallbackEvent::BatchEnd, &batch_logs)?;
+        }
+
+        Ok(metrics.get_average())
+    }
+
+    fn validate(
+        &mut self,
+        val_loader: &mut DataLoader,
+    ) -> Result<HashMap<String, f32>, BellandeError> {
+        let mut metrics = RunningMetrics::new();
+
+        for batch in val_loader.iter() {
+            let (data, target) = batch?;
+            let data = data.to_device(&self.device)?;
+            let target = target.to_device(&self.device)?;
+            let output = self.model.forward(&data)?;
+            let loss = self.loss_fn.forward(&output, &target)?;
+            metrics.update("loss", loss.data[0]);
+        }
+
+        Ok(metrics.get_average())
+    }
+
+    fn call_callbacks(
+        &mut self,
+        event: CallbackEvent,
+        logs: &HashMap<String, f32>,
+    ) -> Result<(), BellandeError> {
+        for callback in &mut self.callbacks {
+            match event {
+                CallbackEvent::TrainBegin => callback.on_train_begin(logs)?,
+                CallbackEvent::TrainEnd => callback.on_train_end(logs)?,
+                CallbackEvent::EpochBegin => {
+                    callback.on_epoch_begin(*logs.get("epoch").unwrap() as usize, logs)?
+                }
+                CallbackEvent::EpochEnd => {
+                    callback.on_epoch_end(*logs.get("epoch").unwrap() as usize, logs)?
+                }
+                CallbackEvent::BatchBegin => callback.on_batch_begin(0, logs)?,
+                CallbackEvent::BatchEnd => callback.on_batch_end(0, logs)?,
+            }
+        }
+        Ok(())
+    }
+}
diff --git a/src/training/validator.rs b/src/training/validator.rs
new file mode 100644
index 0000000..dfa0ae0
--- /dev/null
+++ b/src/training/validator.rs
@@ -0,0 +1,80 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::core::{device::Device, error::BellandeError};
+use crate::data::dataloader::DataLoader;
+use crate::metrics::metrics::Metric;
+use crate::models::models::Model;
+use std::collections::HashMap;
+
+pub struct Validator {
+    model: Box<dyn Model>,
+    metrics: Vec<Box<dyn Metric>>,
+    device: Device,
+}
+
+impl Validator {
+    pub fn new(model: Box<dyn Model>, metrics: Vec<Box<dyn Metric>>, device: Device) -> Self {
+        Validator {
+            model,
+            metrics,
+            device,
+        }
+    }
+
+    pub fn validate(
+        &mut self,
+        val_loader: &mut DataLoader,
+    ) -> Result<HashMap<String, f32>, BellandeError> {
+        self.model.eval();
+
+        // Reset all metrics at the start of validation
+        for metric in &mut self.metrics {
+            metric.reset();
+        }
+
+        for batch in val_loader.iter() {
+            let (data, target) = batch?;
+
+            // Move data to device
+            let data = data.to_device(&self.device)?;
+            let target = target.to_device(&self.device)?;
+
+            let output = self.model.forward(&data)?;
+
+            // Update each metric with the current batch
+            for metric in &mut self.metrics {
+                metric.update(&output, &target);
+            }
+        }
+
+        // Compute final metrics
+        let mut results = HashMap::new();
+        for metric in &self.metrics {
+            results.insert(metric.name().to_string(), metric.compute());
+        }
+
+        Ok(results)
+    }
+}
+
+pub enum CallbackEvent {
+    TrainBegin,
+    TrainEnd,
+    EpochBegin,
+    EpochEnd,
+    BatchBegin,
+    BatchEnd,
+}
diff --git a/src/utilities/byte.rs b/src/utilities/byte.rs
new file mode 100644
index 0000000..8e6183a
--- /dev/null
+++ b/src/utilities/byte.rs
@@ -0,0 +1,74 @@
+// Copyright (C) 2025 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use std::io::{self, Read};
+
+pub trait ReadBytes: Read {
+    #[inline]
+    fn read_u8(&mut self) -> io::Result<u8> {
+        let mut buf = [0; 1];
+        self.read_exact(&mut buf)?;
+        Ok(buf[0])
+    }
+
+    fn read_u16<T: Byte>(&mut self) -> io::Result<u16> {
+        let mut buf = [0; 2];
+        self.read_exact(&mut buf)?;
+        Ok(T::read_u16(&buf))
+    }
+
+    fn read_u32<T: Byte>(&mut self) -> io::Result<u32> {
+        let mut buf = [0; 4];
+        self.read_exact(&mut buf)?;
+        Ok(T::read_u32(&buf))
+    }
+}
+
+impl<R: Read> ReadBytes for R {}
+
+pub trait Byte {
+    fn read_u16(buf: &[u8]) -> u16;
+    fn read_u32(buf: &[u8]) -> u32;
+    fn write_u16(buf: &mut [u8], n: u16);
+    fn write_u32(buf: &mut [u8], n: u32);
+}
+
+pub enum BigEndian {}
+
+impl Byte for BigEndian {
+    #[inline]
+    fn read_u16(buf: &[u8]) -> u16 {
+        ((buf[0] as u16) << 8) | (buf[1] as u16)
+    }
+
+    #[inline]
+    fn read_u32(buf: &[u8]) -> u32 {
+        ((buf[0] as u32) << 24) | ((buf[1] as u32) << 16) | ((buf[2] as u32) << 8) | (buf[3] as u32)
+    }
+
+    #[inline]
+    fn write_u16(buf: &mut [u8], n: u16) {
+        buf[0] = (n >> 8) as u8;
+        buf[1] = n as u8;
+    }
+
+    #[inline]
+    fn write_u32(buf: &mut [u8], n: u32) {
+        buf[0] = (n >> 24) as u8;
+        buf[1] = (n >> 16) as u8;
+        buf[2] = (n >> 8) as u8;
+        buf[3] = n as u8;
+    }
+}
diff --git a/src/utilities/compression.rs b/src/utilities/compression.rs
new file mode 100644
index 0000000..ac156e4
--- /dev/null
+++ b/src/utilities/compression.rs
@@ -0,0 +1,528 @@
+// Copyright (C) 2025 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use std::cmp::min;
+use std::io::{self, Read};
+
+const WINDOW_SIZE: usize = 32768;
+const WINDOW_MASK: usize = WINDOW_SIZE - 1;
+const MAX_BITS: usize = 15;
+const END_BLOCK: u16 = 256;
+
+// Huffman code lengths for fixed literal/length tree
+const FIXED_LITERAL_LENGTHS: &[u8] = &[
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 0-15
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 16-31
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 32-47
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 48-63
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 64-79
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 80-95
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 96-111
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 112-127
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // 128-143
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 144-159
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 160-175
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 176-191
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 192-207
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 208-223
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 224-239
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 240-255
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 256-271
+    7, 7, 7, 7, 7, 7, 7, 7, // 272-279
+    8, 8, 8, 8, 8, 8, 8, 8, // 280-287
+];
+
+// Length and distance base values and extra bits
+const LENGTH_CODES: &[(u16, u8)] = &[
+    (3, 0),
+    (4, 0),
+    (5, 0),
+    (6, 0),
+    (7, 0),
+    (8, 0),
+    (9, 0),
+    (10, 0),
+    (11, 1),
+    (13, 1),
+    (15, 1),
+    (17, 1),
+    (19, 2),
+    (23, 2),
+    (27, 2),
+    (31, 2),
+    (35, 3),
+    (43, 3),
+    (51, 3),
+    (59, 3),
+    (67, 4),
+    (83, 4),
+    (99, 4),
+    (115, 4),
+    (131, 5),
+    (163, 5),
+    (195, 5),
+    (227, 5),
+    (258, 0),
+];
+
+const DISTANCE_CODES: &[(u16, u8)] = &[
+    (1, 0),
+    (2, 0),
+    (3, 0),
+    (4, 0),
+    (5, 1),
+    (7, 1),
+    (9, 2),
+    (13, 2),
+    (17, 3),
+    (25, 3),
+    (33, 4),
+    (49, 4),
+    (65, 5),
+    (97, 5),
+    (129, 6),
+    (193, 6),
+    (257, 7),
+    (385, 7),
+    (513, 8),
+    (769, 8),
+    (1025, 9),
+    (1537, 9),
+    (2049, 10),
+    (3073, 10),
+    (4097, 11),
+    (6145, 11),
+    (8193, 12),
+    (12289, 12),
+    (16385, 13),
+    (24577, 13),
+];
+
+#[derive(Clone)]
+struct HuffmanTree {
+    counts: Vec<i32>,
+    symbols: Vec<u16>,
+    min_code: Vec<i32>,
+    max_code: Vec<i32>,
+}
+
+impl HuffmanTree {
+    fn new() -> Self {
+        HuffmanTree {
+            counts: vec![0; MAX_BITS + 1],
+            symbols: Vec::new(),
+            min_code: vec![0; MAX_BITS + 1],
+            max_code: vec![0; MAX_BITS + 1],
+        }
+    }
+
+    fn build_from_lengths(&mut self, lengths: &[u8], max_symbol: usize) -> io::Result<()> {
+        // Count the number of codes for each code length
+        self.counts.fill(0);
+        for &len in lengths.iter().take(max_symbol) {
+            if len as usize > MAX_BITS {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    "Invalid code length",
+                ));
+            }
+            self.counts[len as usize] += 1;
+        }
+
+        // Compute first code value for each code length
+        let mut code = 0;
+        self.min_code[0] = 0;
+        self.max_code[0] = 0;
+        for bits in 1..=MAX_BITS {
+            code = (code + self.counts[bits - 1]) << 1;
+            self.min_code[bits] = code;
+            self.max_code[bits] = code + self.counts[bits] - 1;
+        }
+
+        // Assign symbols to codes
+        self.symbols = vec![0; max_symbol];
+        let mut symbol_index = 0;
+        for bits in 1..=MAX_BITS {
+            for symbol in 0..max_symbol {
+                if lengths[symbol] as usize == bits {
+                    self.symbols[symbol_index] = symbol as u16;
+                    symbol_index += 1;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    fn decode_symbol<R: Read>(
+        &self,
+        reader: &mut R,
+        bit_reader: &mut BitReader,
+    ) -> io::Result<u16> {
+        let mut len = 1;
+        let mut code = 0;
+
+        while len <= MAX_BITS {
+            code = (code << 1) | if bit_reader.read_bit(reader)? { 1 } else { 0 };
+
+            if code <= self.max_code[len] {
+                let index = (code - self.min_code[len]) as usize;
+                if index < self.symbols.len() {
+                    return Ok(self.symbols[index]);
+                }
+            }
+            len += 1;
+        }
+
+        Err(io::Error::new(
+            io::ErrorKind::InvalidData,
+            "Invalid Huffman code",
+        ))
+    }
+}
+
+struct BitReader {
+    bit_buffer: u32,
+    bits_in_buffer: u8,
+}
+
+impl BitReader {
+    fn new() -> Self {
+        BitReader {
+            bit_buffer: 0,
+            bits_in_buffer: 0,
+        }
+    }
+
+    fn read_bit<R: Read>(&mut self, reader: &mut R) -> io::Result<bool> {
+        if self.bits_in_buffer == 0 {
+            let mut byte = [0u8; 1];
+            reader.read_exact(&mut byte)?;
+            self.bit_buffer = byte[0] as u32;
+            self.bits_in_buffer = 8;
+        }
+        let bit = self.bit_buffer & 1 == 1;
+        self.bit_buffer >>= 1;
+        self.bits_in_buffer -= 1;
+        Ok(bit)
+    }
+
+    fn read_bits<R: Read>(&mut self, reader: &mut R, count: u8) -> io::Result<u32> {
+        let mut result = 0;
+        let mut bits_read = 0;
+
+        while bits_read < count {
+            if self.bits_in_buffer == 0 {
+                let mut byte = [0u8; 1];
+                reader.read_exact(&mut byte)?;
+                self.bit_buffer = byte[0] as u32;
+                self.bits_in_buffer = 8;
+            }
+
+            let bits_to_take = min(count - bits_read, self.bits_in_buffer);
+            let mask = (1 << bits_to_take) - 1;
+            result |= ((self.bit_buffer & mask) << bits_read) as u32;
+
+            self.bit_buffer >>= bits_to_take;
+            self.bits_in_buffer -= bits_to_take;
+            bits_read += bits_to_take;
+        }
+
+        Ok(result)
+    }
+}
+
+pub struct Decoder<R: Read> {
+    inner: R,
+    window: Vec<u8>,
+    window_pos: usize,
+    output_buffer: Vec<u8>,
+    output_pos: usize,
+    literal_tree: HuffmanTree,
+    distance_tree: HuffmanTree,
+    bit_reader: BitReader,
+}
+
+impl<R: Read> Decoder<R> {
+    pub fn new(inner: R) -> Self {
+        Decoder {
+            inner,
+            window: vec![0; WINDOW_SIZE],
+            window_pos: 0,
+            output_buffer: Vec::new(),
+            output_pos: 0,
+            literal_tree: HuffmanTree::new(),
+            distance_tree: HuffmanTree::new(),
+            bit_reader: BitReader::new(),
+        }
+    }
+
+    fn read_header(&mut self) -> io::Result<()> {
+        let mut header = [0u8; 2];
+        self.inner.read_exact(&mut header)?;
+
+        let cmf = header[0];
+        let flg = header[1];
+
+        if (cmf & 0x0F) != 8 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "Invalid compression method",
+            ));
+        }
+
+        if (((cmf as u16) << 8) | flg as u16) % 31 != 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "Invalid header checksum",
+            ));
+        }
+
+        Ok(())
+    }
+
+    fn decode_literal_symbol(&mut self) -> io::Result<u16> {
+        self.literal_tree
+            .decode_symbol(&mut self.inner, &mut self.bit_reader)
+    }
+
+    fn decode_distance_symbol(&mut self) -> io::Result<u16> {
+        self.distance_tree
+            .decode_symbol(&mut self.inner, &mut self.bit_reader)
+    }
+
+    fn process_block(&mut self) -> io::Result<bool> {
+        let is_final = self.bit_reader.read_bit(&mut self.inner)?;
+        let block_type = self.bit_reader.read_bits(&mut self.inner, 2)? as u8;
+
+        match block_type {
+            0 => self.decode_uncompressed_block()?,
+            1 => {
+                self.literal_tree
+                    .build_from_lengths(FIXED_LITERAL_LENGTHS, 288)?;
+                let distance_lengths = vec![5u8; 32];
+                self.distance_tree
+                    .build_from_lengths(&distance_lengths, 32)?;
+                self.process_huffman_block()?;
+            }
+            2 => {
+                self.decode_dynamic_huffman_block()?;
+                self.process_huffman_block()?;
+            }
+            _ => {
+                return Err(io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    "Invalid block type",
+                ))
+            }
+        }
+
+        Ok(is_final)
+    }
+
+    fn decode_dynamic_huffman_block(&mut self) -> io::Result<()> {
+        let hlit = self.bit_reader.read_bits(&mut self.inner, 5)? as usize + 257;
+        let hdist = self.bit_reader.read_bits(&mut self.inner, 5)? as usize + 1;
+        let hclen = self.bit_reader.read_bits(&mut self.inner, 4)? as usize + 4;
+
+        let cl_index = [
+            16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
+        ];
+        let mut cl_lengths = vec![0u8; 19];
+        for i in 0..hclen {
+            cl_lengths[cl_index[i]] = self.bit_reader.read_bits(&mut self.inner, 3)? as u8;
+        }
+
+        let mut code_length_tree = HuffmanTree::new();
+        code_length_tree.build_from_lengths(&cl_lengths, 19)?;
+
+        let mut lengths = Vec::with_capacity(hlit + hdist);
+        while lengths.len() < hlit + hdist {
+            let symbol = code_length_tree.decode_symbol(&mut self.inner, &mut self.bit_reader)?;
+            match symbol {
+                0..=15 => lengths.push(symbol as u8),
+                16 => {
+                    if lengths.is_empty() {
+                        return Err(io::Error::new(
+                            io::ErrorKind::InvalidData,
+                            "Invalid code lengths",
+                        ));
+                    }
+                    let repeat = self.bit_reader.read_bits(&mut self.inner, 2)? as usize + 3;
+                    let value = *lengths.last().unwrap();
+                    lengths.extend(std::iter::repeat(value).take(repeat));
+                }
+                17 => {
+                    let repeat = self.bit_reader.read_bits(&mut self.inner, 3)? as usize + 3;
+                    lengths.extend(std::iter::repeat(0u8).take(repeat));
+                }
+                18 => {
+                    let repeat = self.bit_reader.read_bits(&mut self.inner, 7)? as usize + 11;
+                    lengths.extend(std::iter::repeat(0u8).take(repeat));
+                }
+                _ => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidData,
+                        "Invalid code length code",
+                    ))
+                }
+            }
+        }
+
+        let (literal_lengths, distance_lengths) = lengths.split_at(hlit);
+        self.literal_tree
+            .build_from_lengths(literal_lengths, hlit)?;
+        self.distance_tree
+            .build_from_lengths(distance_lengths, hdist)?;
+
+        Ok(())
+    }
+
+    fn process_huffman_block(&mut self) -> io::Result<()> {
+        loop {
+            let symbol = self
+                .literal_tree
+                .decode_symbol(&mut self.inner, &mut self.bit_reader)?;
+
+            if symbol == END_BLOCK {
+                break;
+            }
+
+            if symbol < 256 {
+                // Literal byte
+                self.window[self.window_pos] = symbol as u8;
+                self.window_pos = (self.window_pos + 1) & WINDOW_MASK;
+                self.output_buffer.push(symbol as u8);
+            } else {
+                // Length/distance pair
+                let length = self.decode_length(symbol as usize - 257)?;
+                let distance_code = self.decode_distance_symbol()?;
+                let distance = self.decode_distance(distance_code as usize)?;
+
+                if distance > self.window_pos {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidData,
+                        "Invalid distance",
+                    ));
+                }
+
+                let start_pos = (self.window_pos - distance) & WINDOW_MASK;
+                for i in 0..length {
+                    let byte = self.window[(start_pos + i) & WINDOW_MASK];
+                    self.window[self.window_pos] = byte;
+                    self.window_pos = (self.window_pos + 1) & WINDOW_MASK;
+                    self.output_buffer.push(byte);
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    fn decode_length(&mut self, code: usize) -> io::Result<usize> {
+        if code >= LENGTH_CODES.len() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "Invalid length code",
+            ));
+        }
+
+        let (base, extra) = LENGTH_CODES[code];
+        let extra_bits = if extra > 0 {
+            self.bit_reader.read_bits(&mut self.inner, extra)? as usize
+        } else {
+            0
+        };
+
+        Ok(base as usize + extra_bits)
+    }
+
+    fn decode_distance(&mut self, code: usize) -> io::Result<usize> {
+        if code >= DISTANCE_CODES.len() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "Invalid distance code",
+            ));
+        }
+
+        let (base, extra) = DISTANCE_CODES[code];
+        let extra_bits = if extra > 0 {
+            self.bit_reader.read_bits(&mut self.inner, extra)? as usize
+        } else {
+            0
+        };
+
+        Ok(base as usize + extra_bits)
+    }
+
+    fn decode_uncompressed_block(&mut self) -> io::Result<()> {
+        // Reset bit buffer since we'll be reading byte-aligned data
+        self.bit_reader.bits_in_buffer = 0;
+
+        let mut header = [0u8; 4];
+        self.inner.read_exact(&mut header)?;
+
+        let len = u16::from_le_bytes([header[0], header[1]]);
+        let nlen = u16::from_le_bytes([header[2], header[3]]);
+
+        if len != !nlen {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "Invalid block length",
+            ));
+        }
+
+        let mut buffer = vec![0; len as usize];
+        self.inner.read_exact(&mut buffer)?;
+
+        for &byte in &buffer {
+            self.window[self.window_pos] = byte;
+            self.window_pos = (self.window_pos + 1) & WINDOW_MASK;
+            self.output_buffer.push(byte);
+        }
+
+        Ok(())
+    }
+}
+
+impl<R: Read> Read for Decoder<R> {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        if self.window_pos == 0 {
+            self.read_header()?;
+        }
+
+        if self.output_pos < self.output_buffer.len() {
+            let remaining = self.output_buffer.len() - self.output_pos;
+            let to_copy = min(remaining, buf.len());
+            buf[..to_copy]
+                .copy_from_slice(&self.output_buffer[self.output_pos..self.output_pos + to_copy]);
+            self.output_pos += to_copy;
+            return Ok(to_copy);
+        }
+
+        self.output_pos = 0;
+        self.output_buffer.clear();
+
+        let is_final = self.process_block()?;
+
+        if self.output_pos < self.output_buffer.len() {
+            self.read(buf)
+        } else if is_final {
+            Ok(0)
+        } else {
+            self.read(buf)
+        }
+    }
+}
diff --git a/src/utilities/config.rs b/src/utilities/config.rs
new file mode 100644
index 0000000..b305cc7
--- /dev/null
+++ b/src/utilities/config.rs
@@ -0,0 +1,245 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::error::Error;
+use std::fs;
+use std::path::Path;
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct Configuration {
+    // Training configuration
+    pub batch_size: usize,
+    pub epochs: usize,
+    pub learning_rate: f32,
+    pub optimizer: OptimizerConfig,
+
+    // Model configuration
+    pub model: ModelConfig,
+
+    // Data configuration
+    pub data: DataConfig,
+
+    // System configuration
+    pub system: SystemConfig,
+
+    // Custom parameters
+    pub parameters: HashMap<String, String>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct OptimizerConfig {
+    pub name: String,
+    pub momentum: Option<f32>,
+    pub beta1: Option<f32>,
+    pub beta2: Option<f32>,
+    pub weight_decay: Option<f32>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct ModelConfig {
+    pub architecture: String,
+    pub input_shape: Vec<usize>,
+    pub num_classes: usize,
+    pub hidden_layers: Vec<usize>,
+    pub dropout_rate: Option<f32>,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct DataConfig {
+    pub train_path: String,
+    pub val_path: Option<String>,
+    pub test_path: Option<String>,
+    pub augmentation: bool,
+    pub normalize: bool,
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct SystemConfig {
+    pub num_workers: usize,
+    pub device: String,
+    pub precision: String,
+    pub seed: Option<u64>,
+}
+
+impl Default for OptimizerConfig {
+    fn default() -> Self {
+        OptimizerConfig {
+            name: "adam".to_string(),
+            momentum: Some(0.9),
+            beta1: Some(0.9),
+            beta2: Some(0.999),
+            weight_decay: Some(0.0),
+        }
+    }
+}
+
+impl Default for ModelConfig {
+    fn default() -> Self {
+        ModelConfig {
+            architecture: "mlp".to_string(),
+            input_shape: vec![784], // Default for MNIST-like data
+            num_classes: 10,
+            hidden_layers: vec![512, 256],
+            dropout_rate: Some(0.5),
+        }
+    }
+}
+
+impl Default for DataConfig {
+    fn default() -> Self {
+        DataConfig {
+            train_path: "data/train".to_string(),
+            val_path: Some("data/val".to_string()),
+            test_path: Some("data/test".to_string()),
+            augmentation: false,
+            normalize: true,
+        }
+    }
+}
+
+impl Default for SystemConfig {
+    fn default() -> Self {
+        SystemConfig {
+            num_workers: num_cpus::get(),
+            device: "cpu".to_string(),
+            precision: "float32".to_string(),
+            seed: None,
+        }
+    }
+}
+
+impl Default for Configuration {
+    fn default() -> Self {
+        Configuration {
+            batch_size: 32,
+            epochs: 10,
+            learning_rate: 0.001,
+            optimizer: OptimizerConfig::default(),
+            model: ModelConfig::default(),
+            data: DataConfig::default(),
+            system: SystemConfig::default(),
+            parameters: HashMap::new(),
+        }
+    }
+}
+
+impl Configuration {
+    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, Box<dyn Error>> {
+        let content = fs::read_to_string(path)?;
+        let config: Configuration = serde_yaml::from_str(&content)?;
+
+        if let Err(validation_error) = config.validate() {
+            return Err(Box::new(std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                validation_error,
+            )));
+        }
+
+        Ok(config)
+    }
+
+    pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<(), Box<dyn Error>> {
+        let content = serde_yaml::to_string(self)?;
+        fs::write(path, content)?;
+        Ok(())
+    }
+
+    pub fn validate(&self) -> Result<(), String> {
+        // Validate batch size
+        if self.batch_size == 0 {
+            return Err("Batch size must be greater than 0".to_string());
+        }
+
+        // Validate learning rate
+        if self.learning_rate <= 0.0 {
+            return Err("Learning rate must be positive".to_string());
+        }
+
+        // Validate model configuration
+        if self.model.input_shape.is_empty() {
+            return Err("Input shape cannot be empty".to_string());
+        }
+
+        if self.model.num_classes == 0 {
+ return Err("Number of classes must be greater than 0".to_string()); + } + + // Validate optimizer configuration + if let Some(momentum) = self.optimizer.momentum { + if !(0.0..=1.0).contains(&momentum) { + return Err("Momentum must be between 0 and 1".to_string()); + } + } + + if let Some(beta1) = self.optimizer.beta1 { + if !(0.0..=1.0).contains(&beta1) { + return Err("Beta1 must be between 0 and 1".to_string()); + } + } + + if let Some(beta2) = self.optimizer.beta2 { + if !(0.0..=1.0).contains(&beta2) { + return Err("Beta2 must be between 0 and 1".to_string()); + } + } + + // Validate data paths + if !Path::new(&self.data.train_path).exists() { + return Err("Training data path does not exist".to_string()); + } + + if let Some(val_path) = &self.data.val_path { + if !Path::new(val_path).exists() { + return Err("Validation data path does not exist".to_string()); + } + } + + if let Some(test_path) = &self.data.test_path { + if !Path::new(test_path).exists() { + return Err("Test data path does not exist".to_string()); + } + } + + // Validate system configuration + if self.system.num_workers == 0 { + return Err("Number of workers must be greater than 0".to_string()); + } + + match self.system.precision.as_str() { + "float32" | "float16" | "bfloat16" => Ok(()), + _ => Err("Invalid precision format".to_string()), + }?; + + Ok(()) + } + + pub fn merge(&mut self, other: &Configuration) { + // Merge only non-default values from other configuration + if other.batch_size != Configuration::default().batch_size { + self.batch_size = other.batch_size; + } + if other.epochs != Configuration::default().epochs { + self.epochs = other.epochs; + } + if other.learning_rate != Configuration::default().learning_rate { + self.learning_rate = other.learning_rate; + } + + // Merge parameters + self.parameters.extend(other.parameters.clone()); + } +} diff --git a/src/utilities/mod.rs b/src/utilities/mod.rs new file mode 100644 index 0000000..d2c920c --- /dev/null +++ b/src/utilities/mod.rs @@ -0,0 +1,6 @@ +pub mod byte; +pub mod compression; +pub mod config; +pub mod profiler; +pub mod progress; +pub mod visualization; diff --git a/src/utilities/profiler.rs b/src/utilities/profiler.rs new file mode 100644 index 0000000..d846826 --- /dev/null +++ b/src/utilities/profiler.rs @@ -0,0 +1,92 @@ +// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . 
+
+use std::collections::HashMap;
+use std::time::{Duration, Instant};
+
+pub struct Profiler {
+    timings: HashMap<String, Vec<Duration>>,
+    current_timers: HashMap<String, Instant>,
+}
+
+impl Profiler {
+    pub fn new() -> Self {
+        Profiler {
+            timings: HashMap::new(),
+            current_timers: HashMap::new(),
+        }
+    }
+
+    pub fn start(&mut self, name: &str) {
+        self.current_timers.insert(name.to_string(), Instant::now());
+    }
+
+    pub fn stop(&mut self, name: &str) {
+        if let Some(start_time) = self.current_timers.remove(name) {
+            let duration = start_time.elapsed();
+            self.timings
+                .entry(name.to_string())
+                .or_insert_with(Vec::new)
+                .push(duration);
+        }
+    }
+
+    pub fn get_statistics(&self, name: &str) -> Option<ProfileStats> {
+        self.timings.get(name).map(|durations| {
+            let total: Duration = durations.iter().sum();
+            let avg = total / durations.len() as u32;
+            let min = durations.iter().min().unwrap();
+            let max = durations.iter().max().unwrap();
+
+            ProfileStats {
+                count: durations.len(),
+                total,
+                average: avg,
+                min: *min,
+                max: *max,
+            }
+        })
+    }
+
+    pub fn reset(&mut self) {
+        self.timings.clear();
+        self.current_timers.clear();
+    }
+
+    pub fn report(&self) -> String {
+        let mut report = String::from("Performance Profile:\n");
+        for (name, stats) in self.timings.iter() {
+            let total: Duration = stats.iter().sum();
+            let avg = total / stats.len() as u32;
+            report.push_str(&format!(
+                "{}: {} calls, total={:?}, avg={:?}\n",
+                name,
+                stats.len(),
+                total,
+                avg
+            ));
+        }
+        report
+    }
+}
+
+#[derive(Debug)]
+pub struct ProfileStats {
+    pub count: usize,
+    pub total: Duration,
+    pub average: Duration,
+    pub min: Duration,
+    pub max: Duration,
+}
diff --git a/src/utilities/progress.rs b/src/utilities/progress.rs
new file mode 100644
index 0000000..dd46725
--- /dev/null
+++ b/src/utilities/progress.rs
@@ -0,0 +1,78 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use std::io::{stdout, Write};
+use std::time::{Duration, Instant};
+
+pub struct ProgressBar {
+    total: usize,
+    current: usize,
+    start_time: Instant,
+    last_update: Instant,
+    update_frequency: Duration,
+}
+
+impl ProgressBar {
+    pub fn new(total: usize) -> Self {
+        ProgressBar {
+            total,
+            current: 0,
+            start_time: Instant::now(),
+            last_update: Instant::now(),
+            update_frequency: Duration::from_millis(100),
+        }
+    }
+
+    pub fn update(&mut self, amount: usize) {
+        self.current += amount;
+        let now = Instant::now();
+        if now.duration_since(self.last_update) >= self.update_frequency {
+            self.render();
+            self.last_update = now;
+        }
+    }
+
+    pub fn finish(&mut self) {
+        self.current = self.total;
+        self.render();
+        println!();
+    }
+
+    fn render(&self) {
+        let progress = self.current as f32 / self.total as f32;
+        let bar_width = 50;
+        let filled = (progress * bar_width as f32) as usize;
+        let empty = bar_width - filled;
+
+        let elapsed = self.start_time.elapsed();
+        let eta = if progress > 0.0 {
+            Duration::from_secs_f32(elapsed.as_secs_f32() / progress * (1.0 - progress))
+        } else {
+            Duration::from_secs(0)
+        };
+
+        print!(
+            "\r[{}{}] {}/{} ({:.1}%) - Elapsed: {:?}, ETA: {:?}",
+            "=".repeat(filled),
+            " ".repeat(empty),
+            self.current,
+            self.total,
+            progress * 100.0,
+            elapsed,
+            eta
+        );
+        stdout().flush().unwrap();
+    }
+}
diff --git a/src/utilities/visualization.rs b/src/utilities/visualization.rs
new file mode 100644
index 0000000..13af11e
--- /dev/null
+++ b/src/utilities/visualization.rs
@@ -0,0 +1,182 @@
+// Copyright (C) 2024 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use plotters::prelude::*;
+use std::collections::HashMap;
+use std::error::Error;
+use std::path::Path;
+
+pub struct VisualizationBuilder {
+    title: String,
+    width: u32,
+    height: u32,
+    x_label: String,
+    y_label: String,
+}
+
+impl VisualizationBuilder {
+    pub fn new() -> Self {
+        VisualizationBuilder {
+            title: String::from("Plot"),
+            width: 800,
+            height: 600,
+            x_label: String::from("X"),
+            y_label: String::from("Y"),
+        }
+    }
+
+    pub fn title(mut self, title: &str) -> Self {
+        self.title = title.to_string();
+        self
+    }
+
+    pub fn size(mut self, width: u32, height: u32) -> Self {
+        self.width = width;
+        self.height = height;
+        self
+    }
+
+    pub fn labels(mut self, x_label: &str, y_label: &str) -> Self {
+        self.x_label = x_label.to_string();
+        self.y_label = y_label.to_string();
+        self
+    }
+}
+
+pub struct Visualization;
+
+impl Visualization {
+    pub fn plot_metrics<P: AsRef<Path>>(
+        history: &HashMap<String, Vec<f32>>,
+        metrics: &[&str],
+        output_path: P,
+        config: VisualizationBuilder,
+    ) -> Result<(), Box<dyn Error>> {
+        let root = BitMapBackend::new(output_path.as_ref(), (config.width, config.height))
+            .into_drawing_area();
+
+        root.fill(&WHITE)?;
+
+        let epochs: Vec<f32> = (0..history.values().next().unwrap().len())
+            .map(|x| x as f32)
+            .collect();
+
+        let min_value = metrics
+            .iter()
+            .filter_map(|&m| history.get(m))
+            .flatten()
+            .fold(f32::INFINITY, |a, &b| a.min(b));
+
+        let max_value = metrics
+            .iter()
+            .filter_map(|&m| history.get(m))
+            .flatten()
+            .fold(f32::NEG_INFINITY, |a, &b| a.max(b));
+
+        let mut chart = ChartBuilder::on(&root)
+            .caption(&config.title, ("sans-serif", 40))
+            .margin(10)
+            .x_label_area_size(40)
+            .y_label_area_size(40)
+            .build_cartesian_2d(0f32..epochs.len() as f32, min_value..max_value)?;
+
+        chart
+            .configure_mesh()
+            .x_desc(&config.x_label)
+            .y_desc(&config.y_label)
+            .draw()?;
+
+        for (idx, &metric) in metrics.iter().enumerate() {
+            if let Some(values) = history.get(metric) {
+                chart
+                    .draw_series(LineSeries::new(
+                        epochs.iter().zip(values).map(|(&x, &y)| (x, y)),
+                        &Palette99::pick(idx),
+                    ))?
+                    .label(metric)
+                    .legend(move |(x, y)| {
+                        PathElement::new(vec![(x, y), (x + 20, y)], &Palette99::pick(idx))
+                    });
+            }
+        }
+
+        chart
+            .configure_series_labels()
+            .background_style(&WHITE.mix(0.8))
+            .border_style(&BLACK)
+            .draw()?;
+
+        Ok(())
+    }
+
+    pub fn plot_confusion_matrix<P: AsRef<Path>>(
+        matrix: &Vec<Vec<usize>>,
+        labels: &[String],
+        output_path: P,
+    ) -> Result<(), Box<dyn Error>> {
+        let width = 800;
+        let height = 600;
+        let root = BitMapBackend::new(output_path.as_ref(), (width, height)).into_drawing_area();
+
+        root.fill(&WHITE)?;
+
+        let n_classes = matrix.len();
+        let max_value = matrix.iter().flatten().max().copied().unwrap_or(1);
+
+        let mut chart = ChartBuilder::on(&root)
+            .caption("Confusion Matrix", ("sans-serif", 40))
+            .margin(10)
+            .x_label_area_size(40)
+            .y_label_area_size(40)
+            .build_cartesian_2d(0f32..n_classes as f32, 0f32..n_classes as f32)?;
+
+        chart
+            .configure_mesh()
+            .disable_x_mesh()
+            .disable_y_mesh()
+            .x_desc("Predicted")
+            .y_desc("Actual")
+            .x_labels(n_classes)
+            .y_labels(n_classes)
+            .x_label_formatter(&|x| labels[x.floor() as usize].clone())
+            .y_label_formatter(&|y| labels[y.floor() as usize].clone())
+            .draw()?;
+
+        for i in 0..n_classes {
+            for j in 0..n_classes {
+                let value = matrix[i][j];
+                let color = RGBColor(
+                    255,
+                    ((1.0 - value as f64 / max_value as f64) * 255.0) as u8,
+                    ((1.0 - value as f64 / max_value as f64) * 255.0) as u8,
+                );
+
+                // Use f32 for coordinates
+                chart.draw_series(std::iter::once(Rectangle::new(
+                    [(j as f32, i as f32), ((j + 1) as f32, (i + 1) as f32)],
+                    color.filled(),
+                )))?;
+
+                chart.draw_series(std::iter::once(Text::new(
+                    value.to_string(),
+                    (j as f32 + 0.5, i as f32 + 0.5),
+                    ("sans-serif", 20).into_font(),
+                )))?;
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/tests/integration_test.rs b/tests/integration_test.rs
new file mode 100644
index 0000000..3a74a03
--- /dev/null
+++ b/tests/integration_test.rs
@@ -0,0 +1,57 @@
+// Copyright (C) 2025 Bellande Artificial Intelligence Computer Vision Research Innovation Center, Ronaldson Bellande
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use std::error::Error;
+
+use bellande_artificial_intelligence_training_framework::{
+    core::tensor::Tensor,
+    layer::{activation::ReLU, conv::Conv2d, linear::Linear, pooling::MaxPool2d},
+    models::sequential::Sequential,
+};
+
+#[test]
+fn test_single_layer() -> Result<(), Box<dyn Error>> {
+    // Create the simplest possible model
+    let mut model = Sequential::new();
+
+    // Add just one conv layer
+    model.add(Box::new(Conv2d::new(
+        3,            // in_channels
+        4,            // out_channels (reduced)
+        (3, 3),       // kernel_size
+        Some((1, 1)), // stride
+        Some((1, 1)), // padding
+        true,         // bias
+    )));
+
+    // Create tiny input
+    let input = Tensor::zeros(&[1, 3, 8, 8]); // Minimal size
+
+    // Test forward pass
+    let output = model.forward(&input)?;
+
+    // Verify output
+    assert_eq!(output.shape()[1], 4); // Check output channels
+
+    Ok(())
+}
+
+// Test tensor operations separately
+#[test]
+fn test_tensor_ops() -> Result<(), Box<dyn Error>> {
+    let tensor = Tensor::zeros(&[1, 3, 8, 8]);
+    assert_eq!(tensor.shape(), &[1, 3, 8, 8]);
+    Ok(())
+}
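
For orientation, the pieces in this commit compose roughly as follows. This is a minimal, hypothetical usage sketch rather than code from the commit: it assumes `Sequential` implements the `Model` trait that `Trainer` expects, reuses the `Conv2d::new` constructor shape from the test above, and leaves the `DataLoader` construction commented out because this commit does not show that constructor.

use bellande_artificial_intelligence_training_framework::{
    core::device::Device,
    layer::conv::Conv2d,
    models::sequential::Sequential,
    training::trainer::Trainer,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build a tiny model from the same layer constructor the integration test uses.
    let mut model = Sequential::new();
    model.add(Box::new(Conv2d::new(3, 4, (3, 3), Some((1, 1)), Some((1, 1)), true)));

    // new_with_adam wires up MSELoss + Adam internally (see src/training/trainer.rs).
    let mut trainer = Trainer::new_with_adam(Box::new(model), 1e-3, Device::CPU)?;

    // Assuming a DataLoader value `train_loader` (its constructor is not shown in this commit):
    // let history = trainer.fit(train_loader, None, 10)?;
    // history.save("history.json")?;
    let _ = &mut trainer;
    Ok(())
}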